pax_global_header00006660000000000000000000000064150213344670014517gustar00rootroot0000000000000052 comment=b35605ace3ddf7c1a5d67a2eb553f034aef41d55 x264-master/000077500000000000000000000000001502133446700130575ustar00rootroot00000000000000x264-master/.gitignore000066400000000000000000000005651502133446700150550ustar00rootroot00000000000000*~ *.a *.d *.diff *.orig *.rej *.dll* *.exe *.def *.lib *.pdb *.mo *.o *.patch *.pc *.pot *.so* *.dylib .*.swp .depend .DS_Store TAGS config.h config.mak config.log x264_config.h x264 checkasm *.264 *.h264 *.2pass *.ffindex *.avs *.mkv *.flv *.mp4 *.y4m *.yuv *.log *.mbtree *.temp *.pyc *.pgd *.pgc .digress_x264 dataDec.txt log.dec common/oclobj.h x264_lookahead.clbin x264-master/.gitlab-ci.yml000066400000000000000000000241721502133446700155210ustar00rootroot00000000000000stages: - build - test - release .variables-debian-amd64: &variables-debian-amd64 _TRIPLET: "" _PLATFORMSUFFIX: "" _WRAPPER: "" .variables-debian-aarch64: &variables-debian-aarch64 _TRIPLET: "" _PLATFORMSUFFIX: "" _WRAPPER: "" .variables-win32: &variables-win32 _TRIPLET: "i686-w64-mingw32" _ARCH: "i686" _OS: "mingw32" _PLATFORMSUFFIX: ".exe" _WRAPPER: "wine" .variables-win64: &variables-win64 _TRIPLET: "x86_64-w64-mingw32" _ARCH: "x86_64" _OS: "mingw32" _PLATFORMSUFFIX: ".exe" _WRAPPER: "wine" .variables-win-armv7: &variables-win-armv7 _TRIPLET: "armv7-w64-mingw32" _PLATFORMSUFFIX: ".exe" _WRAPPER: "" .variables-win-aarch64: &variables-win-aarch64 _TRIPLET: "aarch64-w64-mingw32" _PLATFORMSUFFIX: ".exe" _WRAPPER: "" .variables-macos-x86_64: &variables-macos-x86_64 _TRIPLET: "x86_64-apple-darwin19" _ARCH: "x86_64" _OS: "darwin" _PLATFORMSUFFIX: "" _WRAPPER: "" _XCFLAGS: "-arch x86_64" _XLDFLAGS: "-arch x86_64" _BIN_PATH: /Users/videolanci/sandbox/bin .variables-macos-arm64: &variables-macos-arm64 _TRIPLET: "aarch64-apple-darwin19" _ARCH: "aarch64" _OS: "darwin" _PLATFORMSUFFIX: "" _WRAPPER: "" _XCFLAGS: "-arch arm64" _XLDFLAGS: "-arch arm64" _BIN_PATH: /Users/videolanci/sandbox/bin .variables-android-arm: &variables-android-arm _TRIPLET: "arm-linux-androideabi" _CLANG_TRIPLET: "armv7a-linux-androideabi" _ANDROID_VERSION: "21" _PLATFORMSUFFIX: "" _WRAPPER: "" .variables-android-aarch64: &variables-android-aarch64 _TRIPLET: "aarch64-linux-android" _CLANG_TRIPLET: "aarch64-linux-android" _ANDROID_VERSION: "21" _PLATFORMSUFFIX: "" _WRAPPER: "" .build: stage: build script: | set -x LOCAL_INSTALL_DIR=`pwd`/local_install export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg cd ffmpeg ./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers --extra-ldflags="-static" make -j$(getconf _NPROCESSORS_ONLN) make -j$(getconf _NPROCESSORS_ONLN) install cd .. git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash cd lsmash ./configure --prefix="${LOCAL_INSTALL_DIR}" --extra-ldflags="-static" make -j$(getconf _NPROCESSORS_ONLN) make -j$(getconf _NPROCESSORS_ONLN) install cd .. 
./configure --enable-pic --enable-strip --extra-ldflags="-static" make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm artifacts: name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA" paths: - x264${_PLATFORMSUFFIX} - checkasm8${_PLATFORMSUFFIX} - checkasm10${_PLATFORMSUFFIX} - config.log expire_in: 1 week build-debian-amd64: extends: .build image: registry.videolan.org/vlc-debian-unstable:20240212151604 tags: - docker - amd64 variables: *variables-debian-amd64 build-debian-aarch64: extends: .build image: registry.videolan.org/x264-debian-unstable-aarch64:20211206141032 tags: - docker - aarch64 variables: *variables-debian-aarch64 .build-win: extends: build-debian-amd64 image: registry.videolan.org/vlc-debian-llvm-msvcrt:20240212151604 script: | set -x LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET} export PKGCONFIG=pkg-config export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg cd ffmpeg ./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --cross-prefix="${_TRIPLET}-" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers make -j$(getconf _NPROCESSORS_ONLN) make -j$(getconf _NPROCESSORS_ONLN) install cd .. git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash cd lsmash ./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" make -j$(getconf _NPROCESSORS_ONLN) make -j$(getconf _NPROCESSORS_ONLN) install cd .. ./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm build-win32: extends: .build-win variables: *variables-win32 build-win64: extends: .build-win variables: *variables-win64 .build-llvm-mingw: extends: .build image: registry.videolan.org/vlc-debian-llvm-ucrt:20240212151604 tags: - docker - amd64 script: | set -x PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --cross-prefix="${_TRIPLET}-" --enable-pic --enable-strip make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm build-llvm-mingw-armv7: extends: .build-llvm-mingw variables: *variables-win-armv7 build-llvm-mingw-aarch64: extends: .build-llvm-mingw variables: *variables-win-aarch64 .build-macos: extends: .build script: | set -x export PATH="${_BIN_PATH}:$PATH" LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET} export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig git clone --depth 1 --branch master https://git.ffmpeg.org/ffmpeg.git ffmpeg cd ffmpeg ./configure --prefix="${LOCAL_INSTALL_DIR}" --enable-cross-compile --arch="${_ARCH}" --target-os="${_OS}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}" --enable-pic --disable-debug --disable-programs --disable-doc --disable-avdevice --disable-avfilter --disable-network --disable-encoders --disable-muxers make -j$(getconf _NPROCESSORS_ONLN) make -j$(getconf _NPROCESSORS_ONLN) install cd .. git clone --depth 1 --branch master https://github.com/l-smash/l-smash.git lsmash cd lsmash ./configure --prefix="${LOCAL_INSTALL_DIR}" --target-os="${_TRIPLET}" --extra-cflags="${_XCFLAGS}" --extra-ldflags="${_XLDFLAGS}" make -j$(getconf _NPROCESSORS_ONLN) make -j$(getconf _NPROCESSORS_ONLN) install cd .. 
./configure --host="${_TRIPLET}" --enable-pic --enable-strip make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm build-macos-x86_64: extends: .build-macos tags: - amd64 - monterey variables: *variables-macos-x86_64 build-macos-arm64: extends: .build-macos tags: - amd64 - monterey variables: *variables-macos-arm64 .build-android: extends: .build image: registry.videolan.org/vlc-debian-android:20241118101328 tags: - docker - amd64 script: | set -x CC=${_CLANG_TRIPLET}${_ANDROID_VERSION}-clang AR=llvm-ar RANLIB=llvm-ranlib STRIP=llvm-strip PKGCONFIG=pkg-config ./configure --host="${_TRIPLET}" --enable-pic --enable-strip make -j$(getconf _NPROCESSORS_ONLN) x264 checkasm build-android-arm: extends: .build-android variables: *variables-android-arm build-android-aarch64: extends: .build-android variables: *variables-android-aarch64 .test: &test stage: test script: | set -x ${_WRAPPER} ./checkasm8${_PLATFORMSUFFIX} ${_WRAPPER} ./checkasm10${_PLATFORMSUFFIX} artifacts: expire_in: 10 minutes test-debian-amd64: <<: *test extends: build-debian-amd64 dependencies: - build-debian-amd64 variables: *variables-debian-amd64 test-debian-aarch64: <<: *test extends: build-debian-aarch64 dependencies: - build-debian-aarch64 variables: *variables-debian-aarch64 test-win32: <<: *test extends: build-win32 dependencies: - build-win32 variables: *variables-win32 test-win64: <<: *test extends: build-win64 dependencies: - build-win64 variables: *variables-win64 test-macos-x86_64: <<: *test extends: build-macos-x86_64 dependencies: - build-macos-x86_64 variables: *variables-macos-x86_64 test-aarch64-qemu: <<: *test extends: build-debian-amd64 image: registry.videolan.org/x264-debian-unstable:20231113190916 dependencies: - build-debian-aarch64 variables: *variables-debian-amd64 script: | set -x for size in 128 256 512 1024 2048; do for tool in checkasm8 checkasm10; do qemu-aarch64 -cpu max,sve-default-vector-length=256,sve$size=on -L /usr/aarch64-linux-gnu ./$tool done done .release: &release stage: release script: | set -x _VERSION=$(./version.sh | grep _VERSION -| cut -d\ -f4-| sed 's, ,-,g' | sed 's,",,') mv x264${_PLATFORMSUFFIX} x264-${_VERSION}${_PLATFORMSUFFIX} when: manual only: - master@videolan/x264 - stable@videolan/x264 artifacts: name: "$CI_PROJECT_PATH_SLUG-$CI_JOB_NAME-$CI_COMMIT_SHORT_SHA" paths: - x264-*${_PLATFORMSUFFIX} expire_in: '10 minutes' release-debian-amd64: <<: *release extends: build-debian-amd64 dependencies: - build-debian-amd64 variables: *variables-debian-amd64 release-debian-aarch64: <<: *release extends: build-debian-aarch64 dependencies: - build-debian-aarch64 variables: *variables-debian-aarch64 release-win32: <<: *release extends: build-win32 dependencies: - build-win32 variables: *variables-win32 release-win64: <<: *release extends: build-win64 dependencies: - build-win64 variables: *variables-win64 release-macos-x86_64: <<: *release extends: build-macos-x86_64 dependencies: - build-macos-x86_64 variables: *variables-macos-x86_64 release-macos-arm64: <<: *release extends: build-macos-arm64 dependencies: - build-macos-arm64 variables: *variables-macos-arm64 x264-master/AUTHORS000066400000000000000000000037171502133446700141370ustar00rootroot00000000000000# Contributors to x264 # # The format of this file was inspired by the Linux kernel CREDITS file. # Authors are listed alphabetically. # # The fields are: name (N), email (E), web-address (W), CVS account login (C), # PGP key ID and fingerprint (P), description (D), and snail-mail address (S). 
N: Alex Izvorski E: aizvorski AT gmail DOT com D: x86 asm (sse2) N: Alex Wright E: alexw0885 AT gmail DOT com D: Motion estimation (subpel and mixed refs) D: B-RDO N: bobololo D: Avisynth input D: MP4 muxing N: Christian Heine E: sennindemokrit AT gmx DOT net D: x86 asm N: David Wolstencroft D: Altivec optimizations N: Eric Petit E: eric.petit AT lapsus DOT org C: titer D: Altivec asm D: BeOS and MacOS X ports. S: France N: Fiona Glaser E: fiona AT x264 DOT com D: Maintainer D: All areas of encoder analysis and algorithms D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc D: x86 asm S: USA N: Gabriel Bouvigne E: bouvigne AT mp3-tech DOT org D: 2pass VBV N: Guillaume Poirier E: gpoirier CHEZ mplayerhq POINT hu D: Altivec optimizations S: Brittany, France N: Henrik Gramner E: henrik AT gramner DOT com D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes S: Sweden N: Laurent Aimar E: fenrir AT videolan DOT org C: fenrir D: Initial import, former maintainer D: x86 asm (mmx/mmx2) S: France N: Loren Merritt E: pengvado AT akuvian DOT org C: pengvado D: Maintainer D: All areas of encoder analysis and algorithms D: Motion estimation, rate control, macroblock & frame decisions, RDO, etc D: Multithreading D: x86 asm S: USA N: Mans Rullgard E: mru AT mansr DOT com C: mru D: Rate control S: Southampton, UK N: Michael Niedermayer E: michaelni AT gmx DOT at D: Rate control N: Mike Matsnev E: mike AT po DOT cs DOT msu DOT su D: Matroska muxing N: Min Chen E: chenm001 AT 163 DOT com C: chenm001 D: Win32/VC 6.0 port D: gcc asm to nasm conversion S: China N: Radek Czyz E: radoslaw AT syskin DOT cjb DOT net D: Cached motion compensation x264-master/COPYING000066400000000000000000000431101502133446700141110ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. 
And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. 
c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. 
If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License. 
x264-master/Makefile000066400000000000000000000321211502133446700145160ustar00rootroot00000000000000# Makefile include config.mak vpath %.c $(SRCPATH) vpath %.h $(SRCPATH) vpath %.S $(SRCPATH) vpath %.asm $(SRCPATH) vpath %.rc $(SRCPATH) vpath %.manifest $(SRCPATH) CFLAGS += $(CFLAGSPROF) LDFLAGS += $(LDFLAGSPROF) GENERATED = all: default default: SRCS = common/osdep.c common/base.c common/cpu.c common/tables.c \ encoder/api.c SRCS_X = common/mc.c common/predict.c common/pixel.c common/macroblock.c \ common/frame.c common/dct.c common/cabac.c \ common/common.c common/rectangle.c \ common/set.c common/quant.c common/deblock.c common/vlc.c \ common/mvpred.c common/bitstream.c \ encoder/analyse.c encoder/me.c encoder/ratecontrol.c \ encoder/set.c encoder/macroblock.c encoder/cabac.c \ encoder/cavlc.c encoder/encoder.c encoder/lookahead.c SRCS_8 = SRCCLI = x264.c autocomplete.c input/input.c input/timecode.c input/raw.c \ input/y4m.c output/raw.c output/matroska.c output/matroska_ebml.c \ output/flv.c output/flv_bytestream.c filters/filters.c \ filters/video/video.c filters/video/source.c filters/video/internal.c \ filters/video/resize.c filters/video/fix_vfr_pts.c \ filters/video/select_every.c filters/video/crop.c SRCCLI_X = filters/video/cache.c filters/video/depth.c SRCSO = SRCCHK_X = tools/checkasm.c SRCEXAMPLE = example.c OBJS = OBJASM = OBJSO = OBJCLI = OBJCHK = OBJCHK_8 = OBJCHK_10 = OBJEXAMPLE = CONFIG := $(shell cat config.h) # Optional module sources ifneq ($(findstring HAVE_AVS 1, $(CONFIG)),) SRCCLI += input/avs.c endif ifneq ($(findstring HAVE_THREAD 1, $(CONFIG)),) SRCS_X += common/threadpool.c SRCCLI_X += input/thread.c endif ifneq ($(findstring HAVE_WIN32THREAD 1, $(CONFIG)),) SRCS += common/win32thread.c endif ifneq ($(findstring HAVE_LAVF 1, $(CONFIG)),) SRCCLI += input/lavf.c endif ifneq ($(findstring HAVE_FFMS 1, $(CONFIG)),) SRCCLI += input/ffms.c endif ifneq ($(findstring HAVE_GPAC 1, $(CONFIG)),) SRCCLI += output/mp4.c endif ifneq ($(findstring HAVE_LSMASH 1, $(CONFIG)),) SRCCLI += output/mp4_lsmash.c endif ifneq ($(AS),) # MMX/SSE optims SRCASM_X = ifeq ($(SYS_ARCH),X86) ARCH_X86 = yes SRCASM_X += common/x86/dct-32.asm \ common/x86/pixel-32.asm endif ifeq ($(SYS_ARCH),X86_64) ARCH_X86 = yes SRCASM_X += common/x86/dct-64.asm \ common/x86/trellis-64.asm endif ifdef ARCH_X86 SRCASM_X += common/x86/bitstream-a.asm \ common/x86/const-a.asm \ common/x86/cabac-a.asm \ common/x86/dct-a.asm \ common/x86/deblock-a.asm \ common/x86/mc-a.asm \ common/x86/mc-a2.asm \ common/x86/pixel-a.asm \ common/x86/predict-a.asm \ common/x86/quant-a.asm SRCS_X += common/x86/mc-c.c \ common/x86/predict-c.c OBJASM += common/x86/cpu-a.o ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.asm=%-8.o) common/x86/sad-a-8.o endif ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.asm=%-10.o) common/x86/sad16-a-10.o endif OBJCHK += tools/checkasm-a.o endif # AltiVec optims ifeq ($(SYS_ARCH),PPC) SRCS_X += common/ppc/dct.c \ common/ppc/deblock.c \ common/ppc/mc.c \ common/ppc/pixel.c \ common/ppc/predict.c \ common/ppc/quant.c endif # NEON optims ifeq ($(SYS_ARCH),ARM) SRCASM_X = common/arm/bitstream-a.S \ common/arm/dct-a.S \ common/arm/deblock-a.S \ common/arm/mc-a.S \ common/arm/pixel-a.S \ common/arm/predict-a.S \ common/arm/quant-a.S SRCS_X += common/arm/mc-c.c \ common/arm/predict-c.c OBJASM += common/arm/cpu-a.o ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.S=%-8.o) endif ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),) 
OBJASM += $(SRCASM_X:%.S=%-10.o) endif OBJCHK += tools/checkasm-arm.o endif # AArch64 NEON and SVE/SVE2 optims ifeq ($(SYS_ARCH),AARCH64) SRCASM_X = common/aarch64/bitstream-a.S \ common/aarch64/cabac-a.S \ common/aarch64/dct-a.S \ common/aarch64/deblock-a.S \ common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ common/aarch64/predict-a.S \ common/aarch64/quant-a.S ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),) SRCASM_X += common/aarch64/dct-a-sve.S \ common/aarch64/deblock-a-sve.S \ common/aarch64/mc-a-sve.S \ common/aarch64/pixel-a-sve.S endif ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),) SRCASM_X += common/aarch64/dct-a-sve2.S endif SRCS_X += common/aarch64/asm-offsets.c \ common/aarch64/mc-c.c \ common/aarch64/predict-c.c OBJASM += ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.S=%-8.o) endif ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.S=%-10.o) endif OBJCHK += tools/checkasm-aarch64.o endif # MSA optims ifeq ($(SYS_ARCH),MIPS) ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),) SRCS_X += common/mips/dct-c.c \ common/mips/deblock-c.c \ common/mips/mc-c.c \ common/mips/pixel-c.c \ common/mips/predict-c.c \ common/mips/quant-c.c endif endif # LOONGARCH optimization ifeq ($(SYS_ARCH),LOONGARCH) ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),) SRCASM_X += common/loongarch/deblock-a.S \ common/loongarch/sad-a.S \ common/loongarch/predict-a.S \ common/loongarch/quant-a.S \ common/loongarch/mc-a.S \ common/loongarch/dct-a.S \ common/loongarch/pixel-a.S SRCS_X += common/loongarch/predict-c.c \ common/loongarch/mc-c.c \ common/loongarch/pixel-c.c OBJASM += ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.S=%-8.o) endif ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),) OBJASM += $(SRCASM_X:%.S=%-10.o) endif OBJCHK += tools/checkasm-loongarch.o endif endif endif ifneq ($(HAVE_GETOPT_LONG),1) SRCCLI += extras/getopt.c endif ifeq ($(SYS),WINDOWS) OBJCLI += $(if $(RC), x264res.o) ifneq ($(SONAME),) SRCSO += x264dll.c OBJSO += $(if $(RC), x264res.dll.o) endif endif ifeq ($(HAVE_OPENCL),yes) common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl) cat $^ | $(SRCPATH)/tools/cltostr.sh $@ GENERATED += common/oclobj.h SRCS_8 += common/opencl.c encoder/slicetype-cl.c endif OBJS += $(SRCS:%.c=%.o) OBJCLI += $(SRCCLI:%.c=%.o) OBJSO += $(SRCSO:%.c=%.o) OBJEXAMPLE += $(SRCEXAMPLE:%.c=%.o) ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),) OBJS += $(SRCS_X:%.c=%-8.o) $(SRCS_8:%.c=%-8.o) OBJCLI += $(SRCCLI_X:%.c=%-8.o) OBJCHK_8 += $(SRCCHK_X:%.c=%-8.o) checkasm: checkasm8$(EXE) endif ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),) OBJS += $(SRCS_X:%.c=%-10.o) OBJCLI += $(SRCCLI_X:%.c=%-10.o) OBJCHK_10 += $(SRCCHK_X:%.c=%-10.o) checkasm: checkasm10$(EXE) endif .PHONY: all default fprofiled clean distclean install install-* uninstall cli lib-* checkasm etags cli: x264$(EXE) lib-static: $(LIBX264) lib-shared: $(SONAME) $(LIBX264): $(OBJS) $(OBJASM) rm -f $(LIBX264) $(AR)$@ $(OBJS) $(OBJASM) $(if $(RANLIB), $(RANLIB) $@) $(SONAME): $(OBJS) $(OBJASM) $(OBJSO) $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS) $(IMPLIBNAME): $(SONAME) ifneq ($(EXE),) .PHONY: x264 checkasm8 checkasm10 example x264: x264$(EXE) checkasm8: checkasm8$(EXE) checkasm10: checkasm10$(EXE) example: example$(EXE) endif x264$(EXE): $(OBJCLI) $(CLI_LIBX264) $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS) checkasm8$(EXE): $(OBJCHK) $(OBJCHK_8) $(LIBX264) $(LD)$@ $(OBJCHK) $(OBJCHK_8) $(LIBX264) $(LDFLAGS) checkasm10$(EXE): $(OBJCHK) 
$(OBJCHK_10) $(LIBX264) $(LD)$@ $(OBJCHK) $(OBJCHK_10) $(LIBX264) $(LDFLAGS) example$(EXE): $(OBJEXAMPLE) $(LIBX264) $(LD)$@ $(OBJEXAMPLE) $(LIBX264) $(LDFLAGS) $(OBJS) $(OBJSO): CFLAGS += $(CFLAGSSO) $(OBJCLI): CFLAGS += $(CFLAGSCLI) ALLOBJS = $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10) $(OBJEXAMPLE) $(ALLOBJS): $(GENERATED) %.o: %.c $(DEPCMD) $(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) %-8.o: %.c $(DEPCMD) $(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 %-10.o: %.c $(DEPCMD) $(CC) $(CFLAGS) -c $< $(CC_O) $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 %.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm $(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile %-8.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm $(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=8 -Dprivate_prefix=x264_8 -@ $(if $(STRIP), $(STRIP) -x $@) %-10.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm $(AS) $(ASFLAGS) -o $@ $< -MD $(@:.o=.d) -DBIT_DEPTH=10 -Dprivate_prefix=x264_10 -@ $(if $(STRIP), $(STRIP) -x $@) %.o: %.S $(DEPCMD) $(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile %-8.o: %.S $(DEPCMD) $(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -@ $(if $(STRIP), $(STRIP) -x $@) %-10.o: %.S $(DEPCMD) $(AS) $(ASFLAGS) -o $@ $< $(DEPFLAGS) -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -@ $(if $(STRIP), $(STRIP) -x $@) %.dll.o: %.rc x264.h $(RC) $(RCFLAGS)$@ -DDLL $< %.o: %.rc x264.h x264res.manifest $(RC) $(RCFLAGS)$@ $< config.mak: ./configure # This is kept as a no-op depend: @echo "make depend" is handled implicitly now -include $(wildcard $(ALLOBJS:.o=.d)) # Dummy rule to avoid failing, if the dependency files specify dependencies on # a removed .h file. %.h: @: OBJPROF = $(OBJS) $(OBJSO) $(OBJCLI) # These should cover most of the important codepaths OPT0 = --crf 30 -b1 -m1 -r1 --me dia --no-cabac --direct temporal --ssim --no-weightb OPT1 = --crf 16 -b2 -m3 -r3 --me hex --no-8x8dct --direct spatial --no-dct-decimate -t0 --slice-max-mbs 50 OPT2 = --crf 26 -b4 -m5 -r2 --me hex --cqm jvt --nr 100 --psnr --no-mixed-refs --b-adapt 2 --slice-max-size 1500 OPT3 = --crf 18 -b3 -m9 -r5 --me umh -t1 -A all --b-pyramid normal --direct auto --no-fast-pskip --no-mbtree OPT4 = --crf 22 -b3 -m7 -r4 --me esa -t2 -A all --psy-rd 1.0:1.0 --slices 4 OPT5 = --frames 50 --crf 24 -b3 -m10 -r3 --me tesa -t2 OPT6 = --frames 50 -q0 -m9 -r2 --me hex -Aall OPT7 = --frames 50 -q0 -m2 -r1 --me hex --no-cabac ifeq (,$(VIDS)) fprofiled: @echo 'usage: make fprofiled VIDS="infile1 infile2 ..."' @echo 'where infiles are anything that x264 understands,' @echo 'i.e. YUV with resolution in the filename, y4m, or avisynth.' 
else fprofiled: clean $(MAKE) x264$(EXE) CFLAGSPROF="$(PROF_GEN_CC)" LDFLAGSPROF="$(PROF_GEN_LD)" $(foreach V, $(VIDS), $(foreach I, 0 1 2 3 4 5 6 7, ./x264$(EXE) $(OPT$I) --threads 1 $(V) -o $(DEVNULL) ;)) ifeq ($(COMPILER),CL) # Because Visual Studio timestamps the object files within the PGD, it fails to build if they change - only the executable should be deleted rm -f x264$(EXE) else rm -f $(OBJPROF) endif $(MAKE) CFLAGSPROF="$(PROF_USE_CC)" LDFLAGSPROF="$(PROF_USE_LD)" rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc endif clean: rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(GENERATED) TAGS rm -f $(SONAME) *.a *.lib *.exp *.pdb x264$(EXE) x264_lookahead.clbin rm -f checkasm8$(EXE) checkasm10$(EXE) $(OBJCHK) $(OBJCHK_8) $(OBJCHK_10) rm -f example$(EXE) $(OBJEXAMPLE) rm -f $(OBJPROF:%.o=%.gcda) $(OBJPROF:%.o=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc rm -f $(ALLOBJS:%.o=%.d) distclean: clean rm -f config.mak x264_config.h config.h config.log x264.pc x264.def rm -rf conftest* install-cli: cli $(INSTALL) -d $(DESTDIR)$(bindir) $(INSTALL) x264$(EXE) $(DESTDIR)$(bindir) install-lib-dev: $(INSTALL) -d $(DESTDIR)$(includedir) $(INSTALL) -d $(DESTDIR)$(libdir)/pkgconfig $(INSTALL) -m 644 $(SRCPATH)/x264.h x264_config.h $(DESTDIR)$(includedir) $(INSTALL) -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig install-lib-static: lib-static install-lib-dev $(INSTALL) -d $(DESTDIR)$(libdir) $(INSTALL) -m 644 $(LIBX264) $(DESTDIR)$(libdir) $(if $(RANLIB), $(RANLIB) $(DESTDIR)$(libdir)/$(LIBX264)) install-lib-shared: lib-shared install-lib-dev $(INSTALL) -d $(DESTDIR)$(libdir) ifneq ($(IMPLIBNAME),) $(INSTALL) -d $(DESTDIR)$(bindir) $(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(bindir) $(INSTALL) -m 644 $(IMPLIBNAME) $(DESTDIR)$(libdir) else ifneq ($(SONAME),) ln -f -s $(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX) $(INSTALL) -m 755 $(SONAME) $(DESTDIR)$(libdir) endif install-bashcompletion: ifneq ($(BASHCOMPLETIONSDIR),) $(INSTALL) -d $(DESTDIR)$(BASHCOMPLETIONSDIR) $(INSTALL) -m 644 $(SRCPATH)/tools/bash-autocomplete.sh $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264 endif uninstall: rm -f $(DESTDIR)$(includedir)/x264.h $(DESTDIR)$(includedir)/x264_config.h $(DESTDIR)$(libdir)/libx264.a rm -f $(DESTDIR)$(bindir)/x264$(EXE) $(DESTDIR)$(libdir)/pkgconfig/x264.pc ifneq ($(IMPLIBNAME),) rm -f $(DESTDIR)$(bindir)/$(SONAME) $(DESTDIR)$(libdir)/$(IMPLIBNAME) else ifneq ($(SONAME),) rm -f $(DESTDIR)$(libdir)/$(SONAME) $(DESTDIR)$(libdir)/libx264.$(SOSUFFIX) endif ifneq ($(BASHCOMPLETIONSDIR),) rm -f $(DESTDIR)$(BASHCOMPLETIONSDIR)/x264 endif etags TAGS: etags $(SRCS) $(SRCS_X) $(SRCS_8) x264-master/autocomplete.c000066400000000000000000000246211502133446700157310ustar00rootroot00000000000000/***************************************************************************** * autocomplete: x264cli shell autocomplete ***************************************************************************** * Copyright (C) 2018-2025 x264 project * * Authors: Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "x264cli.h" #include "input/input.h" #if HAVE_LAVF #undef DECLARE_ALIGNED #include #include #endif static const char * const level_names[] = { "1", "1.1", "1.2", "1.3", "1b", "2", "2.1", "2.2", "3", "3.1", "3.2", "4", "4.1", "4.2", "5", "5.1", "5.2", "6", "6.1", "6.2", NULL }; /* Options requiring a value for which we provide suggestions. */ static const char * const opts_suggest[] = { "--alternative-transfer", "--aq-mode", "--asm", "--avcintra-class", "--avcintra-flavor", "--b-adapt", "--b-pyramid", "--colormatrix", "--colorprim", "--cqm", "--demuxer", "--direct", "--frame-packing", "--input-csp", "--input-fmt", "--input-range", "--level", "--log-level", "--me", "--muxer", "--nal-hrd", "--output-csp", "--overscan", "--pass", "-p", "--preset", "--profile", "--pulldown", "--range", "--subme", "-m", "--transfer", "--trellis", "-t", "--tune", "--videoformat", "--weightp", NULL }; /* Options requiring a value for which we don't provide suggestions. */ static const char * const opts_nosuggest[] = { "--b-bias", "--bframes", "-b", "--deblock", "-f", "--bitrate", "-B", "--chroma-qp-offset", "--chromaloc", "--cplxblur", "--cqm4", "--cqm4i", "--cqm4ic", "--cqm4iy", "--cqm4p", "--cqm4pc", "--cqm4py", "--cqm8", "--cqm8i", "--cqm8p", "--crf", "--crf-max", "--crop-rect", "--deadzone-inter", "--deadzone-intra", "--fps", "--frames", "--input-depth", "--input-res", "--ipratio", "--keyint", "-I", "--lookahead-threads", "--mastering-display", "--cll", "--merange", "--min-keyint", "-i", "--mvrange", "--mvrange-thread", "--nr", "--opencl-device", "--output-depth", "--partitions", "-A", "--pbratio", "--psy-rd", "--qblur", "--qcomp", "--qp", "-q", "--qpmax", "--qpmin", "--qpstep", "--ratetol", "--ref", "-r", "--rc-lookahead", "--sar", "--scenecut", "--seek", "--slices", "--slices-max", "--slice-max-size", "--slice-max-mbs", "--slice-min-mbs", "--sps-id", "--sync-lookahead", "--threads", "--timebase", "--vbv-bufsize", "--vbv-init", "--vbv-maxrate", "--video-filter", "--vf", "--zones", NULL }; /* Options requiring a filename. */ static const char * const opts_filename[] = { "--cqmfile", "--dump-yuv", "--index", "--opencl-clbin", "--output", "-o", "--qpfile", "--stats", "--tcfile-in", "--tcfile-out", NULL }; /* Options without an associated value. */ static const char * const opts_standalone[] = { "--8x8dct", "--aud", "--bff", "--bluray-compat", "--cabac", "--constrained-intra", "--cpu-independent", "--dts-compress", "--fake-interlaced", "--fast-pskip", "--filler", "--force-cfr", "--mbtree", "--mixed-refs", "--no-8x8dct", "--no-asm", "--no-cabac", "--no-chroma-me", "--no-dct-decimate", "--no-deblock", "--no-fast-pskip", "--no-mbtree", "--no-mixed-refs", "--no-progress", "--no-psy", "--no-scenecut", "--no-weightb", "--non-deterministic", "--open-gop", "--opencl", "--pic-struct", "--psnr", "--quiet", "--sliced-threads", "--slow-firstpass", "--ssim", "--stitchable", "--tff", "--thread-input", "--verbose", "-v", "--weightb", NULL }; /* Options which shouldn't be suggested in combination with other options. 
*/ static const char * const opts_special[] = { "--fullhelp", "--help", "-h", "--longhelp", "--version", NULL }; static int list_contains( const char * const *list, const char *s ) { if( *s ) for( ; *list; list++ ) if( !strcmp( *list, s ) ) return 1; return 0; } static void suggest( const char *s, const char *cur, int cur_len ) { if( s && *s && !strncmp( s, cur, cur_len ) ) printf( "%s ", s ); } static void suggest_lower( const char *s, const char *cur, int cur_len ) { if( s && *s && !strncasecmp( s, cur, cur_len ) ) { for( ; *s; s++ ) putchar( *s < 'A' || *s > 'Z' ? *s : *s | 0x20 ); putchar( ' ' ); } } static void suggest_num_range( int start, int end, const char *cur, int cur_len ) { char buf[16]; for( int i = start; i <= end; i++ ) { snprintf( buf, sizeof( buf ), "%d", i ); suggest( buf, cur, cur_len ); } } #if HAVE_LAVF /* Suggest each token in a string separated by delimiters. */ static void suggest_token( const char *s, int delim, const char *cur, int cur_len ) { if( s && *s ) { for( const char *tok_end; (tok_end = strchr( s, delim )); s = tok_end + 1 ) { int tok_len = tok_end - s; if( tok_len && tok_len >= cur_len && !strncmp( s, cur, cur_len ) ) printf( "%.*s ", tok_len, s ); } suggest( s, cur, cur_len ); } } #endif #define OPT( opt ) else if( !strcmp( prev, opt ) ) #define OPT2( opt1, opt2 ) else if( !strcmp( prev, opt1 ) || !strcmp( prev, opt2 ) ) #define OPT_TYPE( type ) list_contains( opts_##type, prev ) #define suggest( s ) suggest( s, cur, cur_len ) #define suggest_lower( s ) suggest_lower( s, cur, cur_len ) #define suggest_list( list ) for( const char * const *s = list; *s; s++ ) suggest( *s ) #define suggest_num_range( start, end ) suggest_num_range( start, end, cur, cur_len ) #define suggest_token( s, delim ) suggest_token( s, delim, cur, cur_len ) int x264_cli_autocomplete( const char *prev, const char *cur ) { int cur_len = strlen( cur ); if( 0 ); OPT( "--alternative-transfer" ) suggest_list( x264_transfer_names ); OPT( "--aq-mode" ) suggest_num_range( 0, 3 ); OPT( "--asm" ) for( const x264_cpu_name_t *cpu = x264_cpu_names; cpu->flags; cpu++ ) suggest_lower( cpu->name ); OPT( "--avcintra-class" ) suggest_list( x264_avcintra_class_names ); OPT( "--avcintra-flavor" ) suggest_list( x264_avcintra_flavor_names ); OPT( "--b-adapt" ) suggest_num_range( 0, 2 ); OPT( "--b-pyramid" ) suggest_list( x264_b_pyramid_names ); OPT( "--colormatrix" ) suggest_list( x264_colmatrix_names ); OPT( "--colorprim" ) suggest_list( x264_colorprim_names ); OPT( "--cqm" ) suggest_list( x264_cqm_names ); OPT( "--demuxer" ) suggest_list( x264_demuxer_names ); OPT( "--direct" ) suggest_list( x264_direct_pred_names ); OPT( "--frame-packing" ) suggest_num_range( 0, 7 ); OPT( "--input-csp" ) { for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ ) suggest( x264_cli_csps[i].name ); #if HAVE_LAVF for( const AVPixFmtDescriptor *d = NULL; (d = av_pix_fmt_desc_next( d )); ) suggest( d->name ); #endif } OPT( "--input-fmt" ) { #if HAVE_LAVF void *i = NULL; for( const AVInputFormat *f; (f = av_demuxer_iterate( &i )); ) suggest_token( f->name, ',' ); #endif } OPT( "--input-range" ) suggest_list( x264_range_names ); OPT( "--level" ) suggest_list( level_names ); OPT( "--log-level" ) suggest_list( x264_log_level_names ); OPT( "--me" ) suggest_list( x264_motion_est_names ); OPT( "--muxer" ) suggest_list( x264_muxer_names ); OPT( "--nal-hrd" ) suggest_list( x264_nal_hrd_names ); OPT( "--output-csp" ) suggest_list( x264_output_csp_names ); OPT( "--output-depth" ) { #if HAVE_BITDEPTH8 suggest( "8" ); #endif #if 
HAVE_BITDEPTH10 suggest( "10" ); #endif } OPT( "--overscan" ) suggest_list( x264_overscan_names ); OPT2( "--partitions", "-A" ) suggest_list( x264_partition_names ); OPT2( "--pass", "-p" ) suggest_num_range( 1, 3 ); OPT( "--preset" ) suggest_list( x264_preset_names ); OPT( "--profile" ) suggest_list( x264_valid_profile_names ); OPT( "--pulldown" ) suggest_list( x264_pulldown_names ); OPT( "--range" ) suggest_list( x264_range_names ); OPT2( "--subme", "-m" ) suggest_num_range( 0, 11 ); OPT( "--transfer" ) suggest_list( x264_transfer_names ); OPT2( "--trellis", "-t" ) suggest_num_range( 0, 2 ); OPT( "--tune" ) suggest_list( x264_tune_names ); OPT( "--videoformat" ) suggest_list( x264_vidformat_names ); OPT( "--weightp" ) suggest_num_range( 0, 2 ); else if( !OPT_TYPE( nosuggest ) && !OPT_TYPE( special ) ) { if( OPT_TYPE( filename ) || strncmp( cur, "--", 2 ) ) return 1; /* Fall back to default shell filename autocomplete. */ /* Suggest options. */ suggest_list( opts_suggest ); suggest_list( opts_nosuggest ); suggest_list( opts_filename ); suggest_list( opts_standalone ); /* Only suggest special options if no other options have been specified. */ if( !*prev ) suggest_list( opts_special ); } putchar( '\n' ); return 0; } x264-master/common/000077500000000000000000000000001502133446700143475ustar00rootroot00000000000000x264-master/common/aarch64/000077500000000000000000000000001502133446700155775ustar00rootroot00000000000000x264-master/common/aarch64/asm-offsets.c000066400000000000000000000047161502133446700202020ustar00rootroot00000000000000/***************************************************************************** * asm-offsets.c: check asm offsets for aarch64 ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "asm-offsets.h" #define STATIC_ASSERT(name, x) int assert_##name[2 * !!(x) - 1] #define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \ { \ STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \ } #define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \ { \ STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \ } X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW); X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE); X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE); X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING); X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START); X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P); X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END); X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED); X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE); // the aarch64 asm makes following additional assumptions about the x264_cabac_t // memory layout X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range); X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding); x264-master/common/aarch64/asm-offsets.h000066400000000000000000000032121502133446700201750ustar00rootroot00000000000000/***************************************************************************** * asm-offsets.h: asm offsets for aarch64 ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_AARCH64_ASM_OFFSETS_H #define X264_AARCH64_ASM_OFFSETS_H #define CABAC_I_LOW 0x00 #define CABAC_I_RANGE 0x04 #define CABAC_I_QUEUE 0x08 #define CABAC_I_BYTES_OUTSTANDING 0x0c #define CABAC_P_START 0x10 #define CABAC_P 0x18 #define CABAC_P_END 0x20 #define CABAC_F8_BITS_ENCODED 0x30 #define CABAC_STATE 0x34 #endif x264-master/common/aarch64/asm.S000066400000000000000000000215361502133446700165120ustar00rootroot00000000000000/***************************************************************************** * asm.S: AArch64 utility macros ***************************************************************************** * Copyright (C) 2008-2025 x264 project * * Authors: Mans Rullgard * David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "config.h" #define GLUE(a, b) a ## b #define JOIN(a, b) GLUE(a, b) #ifdef PREFIX # define BASE _x264_ # define SYM_PREFIX _ #else # define BASE x264_ # define SYM_PREFIX #endif #ifdef BIT_DEPTH # define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _) #else # define EXTERN_ASM BASE #endif #define X(s) JOIN(EXTERN_ASM, s) #define X264(s) JOIN(BASE, s) #define EXT(s) JOIN(SYM_PREFIX, s) #ifdef __ELF__ # define ELF #else # define ELF # #endif #ifdef __MACH__ # define MACH #else # define MACH # #endif #if HAVE_AS_FUNC # define FUNC #else # define FUNC # #endif .arch AS_ARCH_LEVEL #if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE #define ENABLE_DOTPROD .arch_extension dotprod #define DISABLE_DOTPROD .arch_extension nodotprod #else #define ENABLE_DOTPROD #define DISABLE_DOTPROD #endif #if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE #define ENABLE_I8MM .arch_extension i8mm #define DISABLE_I8MM .arch_extension noi8mm #else #define ENABLE_I8MM #define DISABLE_I8MM #endif #if HAVE_AS_ARCHEXT_SVE_DIRECTIVE #define ENABLE_SVE .arch_extension sve #define DISABLE_SVE .arch_extension nosve #else #define ENABLE_SVE #define DISABLE_SVE #endif #if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE #define ENABLE_SVE2 .arch_extension sve2 #define DISABLE_SVE2 .arch_extension nosve2 #else #define ENABLE_SVE2 #define DISABLE_SVE2 #endif /* If we do support the .arch_extension directives, disable support for all * the extensions that we may use, in case they were implicitly enabled by * the .arch level. This makes it clear if we try to assemble an instruction * from an unintended extension set; we only allow assmbling such instructions * within regions where we explicitly enable those extensions. */ DISABLE_DOTPROD DISABLE_I8MM DISABLE_SVE DISABLE_SVE2 .macro function name, export=0, align=2 .macro endfunc .if \export ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name .else ELF .size \name, . - \name .endif FUNC .endfunc .purgem endfunc .endm .text .align \align .if \export .global EXTERN_ASM\name ELF .type EXTERN_ASM\name, %function FUNC .func EXTERN_ASM\name EXTERN_ASM\name: .else ELF .type \name, %function FUNC .func \name \name: .endif .endm .macro const name, align=2 .macro endconst ELF .size \name, . 
- \name .purgem endconst .endm ELF .section .rodata MACH .const_data .align \align \name: .endm .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 adrp \rd, \val@PAGE add \rd, \rd, \val@PAGEOFF sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset)@PAGE add \rd, \rd, \val+(\offset)@PAGEOFF .endif #elif defined(PIC) && defined(_WIN32) .if \offset < 0 adrp \rd, \val add \rd, \rd, :lo12:\val sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) #else ldr \rd, =\val+\offset #endif .endm #define FDEC_STRIDE 32 #define FENC_STRIDE 16 .macro SUMSUB_AB sum, sub, a, b add \sum, \a, \b sub \sub, \a, \b .endm .macro unzip t1, t2, s1, s2 uzp1 \t1, \s1, \s2 uzp2 \t2, \s1, \s2 .endm .macro transpose t1, t2, s1, s2 trn1 \t1, \s1, \s2 trn2 \t2, \s1, \s2 .endm .macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3 transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h .endm .macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3 transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h .endm .macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 trn1 \r8\().8h, \r0\().8h, \r1\().8h trn2 \r9\().8h, \r0\().8h, \r1\().8h trn1 \r1\().8h, \r2\().8h, \r3\().8h trn2 \r3\().8h, \r2\().8h, \r3\().8h trn1 \r0\().8h, \r4\().8h, \r5\().8h trn2 \r5\().8h, \r4\().8h, \r5\().8h trn1 \r2\().8h, \r6\().8h, \r7\().8h trn2 \r7\().8h, \r6\().8h, \r7\().8h trn1 \r4\().4s, \r0\().4s, \r2\().4s trn2 \r2\().4s, \r0\().4s, \r2\().4s trn1 \r6\().4s, \r5\().4s, \r7\().4s trn2 \r7\().4s, \r5\().4s, \r7\().4s trn1 \r5\().4s, \r9\().4s, \r3\().4s trn2 \r9\().4s, \r9\().4s, \r3\().4s trn1 \r3\().4s, \r8\().4s, \r1\().4s trn2 \r8\().4s, \r8\().4s, \r1\().4s trn1 \r0\().2d, \r3\().2d, \r4\().2d trn2 \r4\().2d, \r3\().2d, \r4\().2d trn1 \r1\().2d, \r5\().2d, \r6\().2d trn2 \r5\().2d, \r5\().2d, \r6\().2d trn2 \r6\().2d, \r8\().2d, \r2\().2d trn1 \r2\().2d, \r8\().2d, \r2\().2d trn1 \r3\().2d, \r9\().2d, \r7\().2d trn2 \r7\().2d, \r9\().2d, \r7\().2d .endm .macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 trn1 \t0\().16b, \r0\().16b, \r1\().16b trn2 \t1\().16b, \r0\().16b, \r1\().16b trn1 \r1\().16b, \r2\().16b, \r3\().16b trn2 \r3\().16b, \r2\().16b, \r3\().16b trn1 \r0\().16b, \r4\().16b, \r5\().16b trn2 \r5\().16b, \r4\().16b, \r5\().16b trn1 \r2\().16b, \r6\().16b, \r7\().16b trn2 \r7\().16b, \r6\().16b, \r7\().16b trn1 \r4\().8h, \r0\().8h, \r2\().8h trn2 \r2\().8h, \r0\().8h, \r2\().8h trn1 \r6\().8h, \r5\().8h, \r7\().8h trn2 \r7\().8h, \r5\().8h, \r7\().8h trn1 \r5\().8h, \t1\().8h, \r3\().8h trn2 \t1\().8h, \t1\().8h, \r3\().8h trn1 \r3\().8h, \t0\().8h, \r1\().8h trn2 \t0\().8h, \t0\().8h, \r1\().8h trn1 \r0\().4s, \r3\().4s, \r4\().4s trn2 \r4\().4s, \r3\().4s, \r4\().4s trn1 \r1\().4s, \r5\().4s, \r6\().4s trn2 \r5\().4s, \r5\().4s, \r6\().4s trn2 \r6\().4s, \t0\().4s, \r2\().4s trn1 \r2\().4s, \t0\().4s, \r2\().4s trn1 \r3\().4s, \t1\().4s, \r7\().4s trn2 \r7\().4s, \t1\().4s, \r7\().4s .endm .macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().16b, \r0\().16b, \r1\().16b trn2 \t5\().16b, \r0\().16b, \r1\().16b trn1 \t6\().16b, \r2\().16b, \r3\().16b 
trn2 \t7\().16b, \r2\().16b, \r3\().16b trn1 \r0\().8h, \t4\().8h, \t6\().8h trn2 \r2\().8h, \t4\().8h, \t6\().8h trn1 \r1\().8h, \t5\().8h, \t7\().8h trn2 \r3\().8h, \t5\().8h, \t7\().8h .endm .macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8b, \r0\().8b, \r1\().8b trn2 \t5\().8b, \r0\().8b, \r1\().8b trn1 \t6\().8b, \r2\().8b, \r3\().8b trn2 \t7\().8b, \r2\().8b, \r3\().8b trn1 \r0\().4h, \t4\().4h, \t6\().4h trn2 \r2\().4h, \t4\().4h, \t6\().4h trn1 \r1\().4h, \t5\().4h, \t7\().4h trn2 \r3\().4h, \t5\().4h, \t7\().4h .endm x264-master/common/aarch64/bitstream-a.S000066400000000000000000000051151502133446700201350ustar00rootroot00000000000000/***************************************************************************** * bitstream-a.S: aarch64 bitstream functions ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" function nal_escape_neon, export=1 movi v0.16b, #0xff movi v4.16b, #4 mov w3, #3 subs x6, x1, x2 cbz x6, 99f 0: cmn x6, #15 b.lt 16f mov x1, x2 b 100f 16: ld1 {v1.16b}, [x1], #16 ext v2.16b, v0.16b, v1.16b, #14 ext v3.16b, v0.16b, v1.16b, #15 cmhi v7.16b, v4.16b, v1.16b cmeq v5.16b, v2.16b, #0 cmeq v6.16b, v3.16b, #0 and v5.16b, v5.16b, v7.16b and v5.16b, v5.16b, v6.16b shrn v7.8b, v5.8h, #4 mov x7, v7.d[0] cbz x7, 16f mov x6, #-16 100: umov w5, v0.b[14] umov w4, v0.b[15] orr w5, w4, w5, lsl #8 101: ldrb w4, [x1, x6] orr w9, w4, w5, lsl #16 cmp w9, #3 b.hi 102f strb w3, [x0], #1 orr w5, w3, w5, lsl #8 102: adds x6, x6, #1 strb w4, [x0], #1 orr w5, w4, w5, lsl #8 b.lt 101b subs x6, x1, x2 lsr w9, w5, #8 mov v0.b[14], w9 mov v0.b[15], w5 b.lt 0b ret 16: subs x6, x1, x2 st1 {v1.16b}, [x0], #16 mov v0.16b, v1.16b b.lt 0b 99: ret endfunc x264-master/common/aarch64/bitstream.h000066400000000000000000000026371502133446700177520ustar00rootroot00000000000000/***************************************************************************** * bitstream.h: aarch64 bitstream functions ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_AARCH64_BITSTREAM_H #define X264_AARCH64_BITSTREAM_H #define x264_nal_escape_neon x264_template(nal_escape_neon) uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); #endif x264-master/common/aarch64/cabac-a.S000066400000000000000000000104651502133446700172000ustar00rootroot00000000000000/***************************************************************************** * cabac-a.S: aarch64 cabac ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
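 *
 * Note: the field offsets used below (CABAC_I_LOW, CABAC_I_RANGE, ...) come
 * from asm-offsets.h and are verified against x264_cabac_t by the
 * compile-time checks that accompany that header, so this file can address
 * the struct fields directly.  As a rough, simplified C sketch of the
 * decision step implemented by cabac_encode_decision_asm (renormalization
 * and byte output are handled by cabac_encode_renorm / cabac_putbyte below;
 * names follow the checked struct fields):
 *
 *   int state     = cb->state[ctx];
 *   int range_lps = x264_cabac_range_lps[state >> 1][(cb->i_range >> 6) & 3];
 *   cb->i_range  -= range_lps;
 *   if( b != (state & 1) )
 *   {
 *       cb->i_low  += cb->i_range;   // LPS: low moves up by the new MPS range
 *       cb->i_range = range_lps;     //      and the range shrinks to the LPS size
 *   }
 *   cb->state[ctx] = x264_cabac_transition[state][b];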
*****************************************************************************/ #include "asm.S" #include "asm-offsets.h" // w11 holds x264_cabac_t.i_low // w12 holds x264_cabac_t.i_range function cabac_encode_decision_asm, export=1 add w10, w1, #CABAC_STATE ldrb w3, [x0, w10, uxtw] // i_state ldr w12, [x0, #CABAC_I_RANGE] movrel x8, X264(cabac_range_lps), -4 movrel x9, X264(cabac_transition) ubfx x4, x3, #1, #7 asr w5, w12, #6 add x8, x8, x4, lsl #2 orr w14, w2, w3, lsl #1 ldrb w4, [x8, w5, uxtw] // i_range_lps ldr w11, [x0, #CABAC_I_LOW] eor w6, w2, w3 // b ^ i_state ldrb w9, [x9, w14, uxtw] sub w12, w12, w4 add w7, w11, w12 tst w6, #1 // (b ^ i_state) & 1 csel w12, w4, w12, ne csel w11, w7, w11, ne strb w9, [x0, w10, uxtw] // i_state cabac_encode_renorm: ldr w2, [x0, #CABAC_I_QUEUE] clz w5, w12 sub w5, w5, #23 lsl w11, w11, w5 lsl w12, w12, w5 adds w2, w2, w5 b.ge cabac_putbyte stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range str w2, [x0, #CABAC_I_QUEUE] ret .align 5 cabac_putbyte: ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING] add w14, w2, #10 mov w13, #-1 sub w2, w2, #8 asr w4, w11, w14 // out lsl w13, w13, w14 subs w5, w4, #0xff bic w11, w11, w13 cinc w6, w6, eq b.eq 0f 1: ldr x7, [x0, #CABAC_P] asr w5, w4, #8 // carry ldurb w8, [x7, #-1] add w8, w8, w5 sub w5, w5, #1 sturb w8, [x7, #-1] cbz w6, 3f 2: subs w6, w6, #1 strb w5, [x7], #1 b.gt 2b 3: strb w4, [x7], #1 str x7, [x0, #CABAC_P] 0: stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range stp w2, w6, [x0, #CABAC_I_QUEUE] // store i_queue, i_bytes_outstanding ret endfunc function cabac_encode_bypass_asm, export=1, align=5 ldr w12, [x0, #CABAC_I_RANGE] ldr w11, [x0, #CABAC_I_LOW] ldr w2, [x0, #CABAC_I_QUEUE] and w1, w1, w12 add w11, w1, w11, lsl #1 adds w2, w2, #1 b.ge cabac_putbyte str w11, [x0, #CABAC_I_LOW] str w2, [x0, #CABAC_I_QUEUE] ret endfunc function cabac_encode_terminal_asm, export=1, align=5 ldr w12, [x0, #CABAC_I_RANGE] sub w12, w12, #2 tbz w12, #8, 1f str w12, [x0, #CABAC_I_RANGE] ret 1: ldr w2, [x0, #CABAC_I_QUEUE] ldr w11, [x0, #CABAC_I_LOW] lsl w12, w12, #1 adds w2, w2, #1 lsl w11, w11, #1 b.ge cabac_putbyte stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range str w2, [x0, #CABAC_I_QUEUE] ret endfunc x264-master/common/aarch64/dct-a-common.S000066400000000000000000000032771502133446700202120ustar00rootroot00000000000000/**************************************************************************** * dct-a-common.S: aarch64 transform and zigzag ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
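 *
 * Note: DCT_1D below is the usual H.264 4x4 forward-transform butterfly.
 * In rough scalar form, with the four input rows d0..d3 arriving in the
 * last four macro arguments and the results left in the first four:
 *
 *   s03 = d0 + d3;   d03 = d0 - d3;
 *   s12 = d1 + d2;   d12 = d1 - d2;
 *   out0 =   s03 +   s12;
 *   out1 = 2*d03 +   d12;
 *   out2 =   s03 -   s12;
 *   out3 =   d03 - 2*d12;
 *
 * (s03/d03/s12/d12 are illustrative names only; the multiplications by 2
 * are done with adds in the vector code.)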
*****************************************************************************/ // This file contains the NEON macros that are intended to be used by // the SVE/SVE2 functions as well .macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7 SUMSUB_AB \v1, \v6, \v5, \v6 SUMSUB_AB \v3, \v7, \v4, \v7 add \v0, \v3, \v1 add \v4, \v7, \v7 add \v5, \v6, \v6 sub \v2, \v3, \v1 add \v1, \v4, \v6 sub \v3, \v7, \v5 .endm x264-master/common/aarch64/dct-a-sve.S000066400000000000000000000062711502133446700175140ustar00rootroot00000000000000/**************************************************************************** * dct-a-sve.S: aarch64 transform and zigzag ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" #include "dct-a-common.S" ENABLE_SVE function sub4x4_dct_sve, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE ptrue p0.h, vl4 ld1b {z0.h}, p0/z, [x1] add x1, x1, x3 ld1b {z1.h}, p0/z, [x2] add x2, x2, x4 ld1b {z2.h}, p0/z, [x1] add x1, x1, x3 sub v16.4h, v0.4h, v1.4h ld1b {z3.h}, p0/z, [x2] add x2, x2, x4 ld1b {z4.h}, p0/z, [x1] add x1, x1, x3 sub v17.4h, v2.4h, v3.4h ld1b {z5.h}, p0/z, [x2] add x2, x2, x4 ld1b {z6.h}, p0/z, [x1] sub v18.4h, v4.4h, v5.4h ld1b {z7.h}, p0/z, [x2] sub v19.4h, v6.4h, v7.4h DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7 DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] ret endfunc function zigzag_interleave_8x8_cavlc_sve, export=1 mov z31.s, #1 ptrue p2.s, vl2 ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 umax v16.8h, v0.8h, v4.8h umax v17.8h, v1.8h, v5.8h umax v18.8h, v2.8h, v6.8h umax v19.8h, v3.8h, v7.8h st1 {v0.8h}, [x0], #16 st1 {v4.8h}, [x0], #16 umaxp v16.8h, v16.8h, v17.8h umaxp v18.8h, v18.8h, v19.8h st1 {v1.8h}, [x0], #16 st1 {v5.8h}, [x0], #16 umaxp v16.8h, v16.8h, v18.8h st1 {v2.8h}, [x0], #16 st1 {v6.8h}, [x0], #16 cmhs v16.4s, v16.4s, v31.4s st1 {v3.8h}, [x0], #16 and v16.16b, v16.16b, v31.16b st1 {v7.8h}, [x0], #16 st1b {z16.s}, p2, [x2] add x2, x2, #8 mov v16.d[0], v16.d[1] st1b {z16.s}, p2, [x2] ret endfunc x264-master/common/aarch64/dct-a-sve2.S000066400000000000000000000057011502133446700175730ustar00rootroot00000000000000/**************************************************************************** * dct-a-sve2.S: aarch64 transform and zigzag ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Chen * * This program is free software; you can 
redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" #include "dct-a-common.S" ENABLE_SVE ENABLE_SVE2 function add4x4_idct_sve2, export=1 mov x2, #FDEC_STRIDE mov x11, x0 ptrue p0.h, vl8 ptrue p1.h, vl4 ld1 {v0.8h, v1.8h}, [x1] SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h sshr v7.8h, v0.8h, #1 sshr v6.8h, v1.8h, #1 sub v7.8h, v7.8h, v1.8h add v6.8h, v6.8h, v0.8h mov v7.d[0], v7.d[1] mov v6.d[0], v6.d[1] ld1b {z28.h}, p0/z, [x11] add x11, x11, x2 SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19 SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h sshr v7.4h, v1.4h, #1 sshr v6.4h, v2.4h, #1 sub v7.4h, v7.4h, v2.4h add v6.4h, v6.4h, v1.4h ld1b {z29.h}, p0/z, [x11] add x11, x11, x2 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h srshr z0.h, p1/m, z0.h, #6 srshr z1.h, p1/m, z1.h, #6 ld1b {z31.h}, p0/z, [x11] add x11, x11, x2 srshr z2.h, p1/m, z2.h, #6 srshr z3.h, p1/m, z3.h, #6 ld1b {z30.h}, p0/z, [x11] add v0.8h, v0.8h, v28.8h add v1.8h, v1.8h, v29.8h add v2.8h, v2.8h, v30.8h add v3.8h, v3.8h, v31.8h sqxtunb z0.b, z0.h sqxtunb z1.b, z1.h sqxtunb z2.b, z2.h sqxtunb z3.b, z3.h st1b {z0.h}, p1, [x0] add x0, x0, x2 st1b {z1.h}, p1, [x0] add x0, x0, x2 st1b {z3.h}, p1, [x0] add x0, x0, x2 st1b {z2.h}, p1, [x0] ret endfunc x264-master/common/aarch64/dct-a.S000066400000000000000000001004701502133446700167150ustar00rootroot00000000000000/**************************************************************************** * dct-a.S: aarch64 transform and zigzag ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
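 *
 * Note: the scan4x4_* constants below are byte indices for the NEON tbl
 * instruction: each 16-bit coefficient k occupies bytes 2*k and 2*k+1, so
 * scan4x4_frame starts 0,1, 8,9, 2,3, ... i.e. coefficients 0, 4, 1, ...
 * in zigzag order.  The sub4x4_* tables index single pixel bytes instead.
 * The DCT routines themselves follow the usual pattern: subtract the
 * prediction (pix1 advances by FENC_STRIDE, pix2 by FDEC_STRIDE), transform
 * the rows, transpose, then transform the columns (DCT_1D is shared via
 * dct-a-common.S).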
*****************************************************************************/ #include "asm.S" #include "dct-a-common.S" const scan4x4_frame, align=4 .byte 0,1, 8,9, 2,3, 4,5 .byte 10,11, 16,17, 24,25, 18,19 .byte 12,13, 6,7, 14,15, 20,21 .byte 26,27, 28,29, 22,23, 30,31 endconst const scan4x4_field, align=4 .byte 0,1, 2,3, 8,9, 4,5 .byte 6,7, 10,11, 12,13, 14,15 endconst const sub4x4_frame, align=4 .byte 0, 1, 4, 8 .byte 5, 2, 3, 6 .byte 9, 12, 13, 10 .byte 7, 11, 14, 15 endconst const sub4x4_field, align=4 .byte 0, 4, 1, 8 .byte 12, 5, 9, 13 .byte 2, 6, 10, 14 .byte 3, 7, 11, 15 endconst // sum = a + (b>>shift) sub = (a>>shift) - b .macro SUMSUB_SHR shift sum sub a b t0 t1 sshr \t0, \b, #\shift sshr \t1, \a, #\shift add \sum, \a, \t0 sub \sub, \t1, \b .endm // sum = (a>>shift) + b sub = a - (b>>shift) .macro SUMSUB_SHR2 shift sum sub a b t0 t1 sshr \t0, \a, #\shift sshr \t1, \b, #\shift add \sum, \t0, \b sub \sub, \a, \t1 .endm // a += 1.5*ma b -= 1.5*mb .macro SUMSUB_15 a b ma mb t0 t1 sshr \t0, \ma, #1 sshr \t1, \mb, #1 add \t0, \t0, \ma add \t1, \t1, \mb add \a, \a, \t0 sub \b, \b, \t1 .endm function dct4x4dc_neon, export=1 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] movi v31.4h, #1 SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h transpose v4.4h, v6.4h, v0.4h, v2.4h transpose v5.4h, v7.4h, v1.4h, v3.4h SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h transpose v4.2s, v5.2s, v0.2s, v1.2s transpose v6.2s, v7.2s, v2.2s, v3.2s add v16.4h, v4.4h, v31.4h add v17.4h, v6.4h, v31.4h srhadd v0.4h, v4.4h, v5.4h shsub v1.4h, v16.4h, v5.4h shsub v2.4h, v17.4h, v7.4h srhadd v3.4h, v6.4h, v7.4h st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] ret endfunc function idct4x4dc_neon, export=1 ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h transpose v4.4h, v6.4h, v0.4h, v2.4h transpose v5.4h, v7.4h, v1.4h, v3.4h SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h transpose v4.2s, v5.2s, v0.2s, v1.2s transpose v6.2s, v7.2s, v2.2s, v3.2s SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] ret endfunc function sub4x4_dct_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE ld1 {v0.s}[0], [x1], x3 ld1 {v1.s}[0], [x2], x4 ld1 {v2.s}[0], [x1], x3 usubl v16.8h, v0.8b, v1.8b ld1 {v3.s}[0], [x2], x4 ld1 {v4.s}[0], [x1], x3 usubl v17.8h, v2.8b, v3.8b ld1 {v5.s}[0], [x2], x4 ld1 {v6.s}[0], [x1], x3 usubl v18.8h, v4.8b, v5.8b ld1 {v7.s}[0], [x2], x4 usubl v19.8h, v6.8b, v7.8b DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7 DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] ret endfunc function sub8x4_dct_neon ld1 {v0.8b}, [x1], x3 ld1 {v1.8b}, [x2], x4 usubl v16.8h, v0.8b, v1.8b ld1 {v2.8b}, [x1], x3 ld1 {v3.8b}, [x2], x4 usubl v17.8h, v2.8b, v3.8b ld1 {v4.8b}, [x1], x3 ld1 {v5.8b}, [x2], x4 usubl v18.8h, v4.8b, v5.8b ld1 {v6.8b}, [x1], x3 ld1 {v7.8b}, [x2], x4 usubl v19.8h, v6.8b, v7.8b DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7 SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h add v22.8h, v19.8h, v19.8h add v21.8h, v18.8h, v18.8h add v0.8h, v16.8h, v17.8h sub v1.8h, v16.8h, v17.8h add v2.8h, 
v22.8h, v18.8h sub v3.8h, v19.8h, v21.8h zip1 v4.2d, v0.2d, v2.2d zip2 v6.2d, v0.2d, v2.2d zip1 v5.2d, v1.2d, v3.2d zip2 v7.2d, v1.2d, v3.2d st1 {v4.8h}, [x0], #16 st1 {v5.8h}, [x0], #16 st1 {v6.8h}, [x0], #16 st1 {v7.8h}, [x0], #16 ret endfunc function sub8x8_dct_neon, export=1 mov x5, x30 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE bl sub8x4_dct_neon mov x30, x5 b sub8x4_dct_neon endfunc function sub16x16_dct_neon, export=1 mov x5, x30 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE bl sub8x4_dct_neon bl sub8x4_dct_neon sub x1, x1, #8*FENC_STRIDE-8 sub x2, x2, #8*FDEC_STRIDE-8 bl sub8x4_dct_neon bl sub8x4_dct_neon sub x1, x1, #8 sub x2, x2, #8 bl sub8x4_dct_neon bl sub8x4_dct_neon sub x1, x1, #8*FENC_STRIDE-8 sub x2, x2, #8*FDEC_STRIDE-8 bl sub8x4_dct_neon mov x30, x5 b sub8x4_dct_neon endfunc .macro DCT8_1D type SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34 SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25 SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16 SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07 SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2 SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3 SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5 sshr v23.8h, v21.8h, #1 sshr v18.8h, v16.8h, #1 add v23.8h, v23.8h, v21.8h add v18.8h, v18.8h, v16.8h sub v30.8h, v30.8h, v23.8h sub v29.8h, v29.8h, v18.8h SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7 sshr v22.8h, v20.8h, #1 sshr v19.8h, v17.8h, #1 add v22.8h, v22.8h, v20.8h add v19.8h, v19.8h, v17.8h add v22.8h, v28.8h, v22.8h add v31.8h, v31.8h, v19.8h SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h .endm function sub8x8_dct8_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE ld1 {v16.8b}, [x1], x3 ld1 {v17.8b}, [x2], x4 ld1 {v18.8b}, [x1], x3 ld1 {v19.8b}, [x2], x4 usubl v0.8h, v16.8b, v17.8b ld1 {v20.8b}, [x1], x3 ld1 {v21.8b}, [x2], x4 usubl v1.8h, v18.8b, v19.8b ld1 {v22.8b}, [x1], x3 ld1 {v23.8b}, [x2], x4 usubl v2.8h, v20.8b, v21.8b ld1 {v24.8b}, [x1], x3 ld1 {v25.8b}, [x2], x4 usubl v3.8h, v22.8b, v23.8b ld1 {v26.8b}, [x1], x3 ld1 {v27.8b}, [x2], x4 usubl v4.8h, v24.8b, v25.8b ld1 {v28.8b}, [x1], x3 ld1 {v29.8b}, [x2], x4 usubl v5.8h, v26.8b, v27.8b ld1 {v30.8b}, [x1], x3 ld1 {v31.8b}, [x2], x4 usubl v6.8h, v28.8b, v29.8b usubl v7.8h, v30.8b, v31.8b DCT8_1D row transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 DCT8_1D col st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64 st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64 ret endfunc function sub16x16_dct8_neon, export=1 mov x7, x30 bl X(sub8x8_dct8_neon) sub x1, x1, #FENC_STRIDE*8 - 8 sub x2, x2, #FDEC_STRIDE*8 - 8 bl X(sub8x8_dct8_neon) sub x1, x1, #8 sub x2, x2, #8 bl X(sub8x8_dct8_neon) mov x30, x7 sub x1, x1, #FENC_STRIDE*8 - 8 sub x2, x2, #FDEC_STRIDE*8 - 8 b X(sub8x8_dct8_neon) endfunc // First part of IDCT (minus final SUMSUB_BA) .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3 SUMSUB_AB \d4, \d5, \d0, \d2 sshr \d7, \d1, #1 sshr \d6, \d3, #1 sub \d7, \d7, \d3 add \d6, \d6, \d1 .endm function add4x4_idct_neon, export=1 mov x2, #FDEC_STRIDE ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h ld1 {v28.s}[0], [x0], x2 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19 IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h ld1 {v29.s}[0], [x0], x2 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, 
v3.4h, v5.4h, v7.4h srshr v0.4h, v0.4h, #6 srshr v1.4h, v1.4h, #6 ld1 {v31.s}[0], [x0], x2 srshr v2.4h, v2.4h, #6 srshr v3.4h, v3.4h, #6 ld1 {v30.s}[0], [x0], x2 sub x0, x0, x2, lsl #2 uaddw v0.8h, v0.8h, v28.8b uaddw v1.8h, v1.8h, v29.8b uaddw v2.8h, v2.8h, v30.8b uaddw v3.8h, v3.8h, v31.8b sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h st1 {v0.s}[0], [x0], x2 st1 {v1.s}[0], [x0], x2 st1 {v3.s}[0], [x0], x2 st1 {v2.s}[0], [x0], x2 ret endfunc function add8x4_idct_neon, export=1 ld1 {v0.8h,v1.8h}, [x1], #32 ld1 {v2.8h,v3.8h}, [x1], #32 transpose v20.2d, v21.2d, v0.2d, v2.2d transpose v22.2d, v23.2d, v1.2d, v3.2d IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7 IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h srshr v0.8h, v0.8h, #6 ld1 {v28.8b}, [x0], x2 srshr v1.8h, v1.8h, #6 ld1 {v29.8b}, [x0], x2 srshr v2.8h, v2.8h, #6 ld1 {v30.8b}, [x0], x2 srshr v3.8h, v3.8h, #6 ld1 {v31.8b}, [x0], x2 sub x0, x0, x2, lsl #2 uaddw v0.8h, v0.8h, v28.8b uaddw v1.8h, v1.8h, v29.8b uaddw v2.8h, v2.8h, v30.8b uaddw v3.8h, v3.8h, v31.8b sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h st1 {v0.8b}, [x0], x2 sqxtun v2.8b, v2.8h st1 {v1.8b}, [x0], x2 sqxtun v3.8b, v3.8h st1 {v2.8b}, [x0], x2 st1 {v3.8b}, [x0], x2 ret endfunc function add8x8_idct_neon, export=1 mov x2, #FDEC_STRIDE mov x5, x30 bl X(add8x4_idct_neon) mov x30, x5 b X(add8x4_idct_neon) endfunc function add16x16_idct_neon, export=1 mov x2, #FDEC_STRIDE mov x5, x30 bl X(add8x4_idct_neon) bl X(add8x4_idct_neon) sub x0, x0, #8*FDEC_STRIDE-8 bl X(add8x4_idct_neon) bl X(add8x4_idct_neon) sub x0, x0, #8 bl X(add8x4_idct_neon) bl X(add8x4_idct_neon) sub x0, x0, #8*FDEC_STRIDE-8 bl X(add8x4_idct_neon) mov x30, x5 b X(add8x4_idct_neon) endfunc .macro IDCT8_1D type SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2 .ifc \type, row ld1 {v22.8h,v23.8h}, [x1], #32 .endif SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4 SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1 SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3 SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5 SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7 SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6 SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4 SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h .endm function add8x8_idct8_neon, export=1 mov x2, #FDEC_STRIDE ld1 {v16.8h,v17.8h}, [x1], #32 ld1 {v18.8h,v19.8h}, [x1], #32 ld1 {v20.8h,v21.8h}, [x1], #32 IDCT8_1D row transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31 IDCT8_1D col ld1 {v0.8b}, [x0], x2 srshr v16.8h, v16.8h, #6 ld1 {v1.8b}, [x0], x2 srshr v17.8h, v17.8h, #6 ld1 {v2.8b}, [x0], x2 srshr v18.8h, v18.8h, #6 ld1 {v3.8b}, [x0], x2 srshr v19.8h, v19.8h, #6 ld1 {v4.8b}, [x0], x2 srshr v20.8h, v20.8h, #6 ld1 {v5.8b}, [x0], x2 srshr v21.8h, v21.8h, #6 ld1 {v6.8b}, [x0], x2 srshr v22.8h, v22.8h, #6 ld1 {v7.8b}, [x0], x2 srshr v23.8h, v23.8h, #6 sub x0, x0, x2, lsl #3 uaddw v16.8h, v16.8h, v0.8b uaddw v17.8h, v17.8h, v1.8b uaddw v18.8h, v18.8h, v2.8b sqxtun v0.8b, v16.8h sqxtun v1.8b, v17.8h sqxtun 
v2.8b, v18.8h uaddw v19.8h, v19.8h, v3.8b st1 {v0.8b}, [x0], x2 uaddw v20.8h, v20.8h, v4.8b st1 {v1.8b}, [x0], x2 uaddw v21.8h, v21.8h, v5.8b st1 {v2.8b}, [x0], x2 sqxtun v3.8b, v19.8h sqxtun v4.8b, v20.8h uaddw v22.8h, v22.8h, v6.8b uaddw v23.8h, v23.8h, v7.8b st1 {v3.8b}, [x0], x2 sqxtun v5.8b, v21.8h st1 {v4.8b}, [x0], x2 sqxtun v6.8b, v22.8h sqxtun v7.8b, v23.8h st1 {v5.8b}, [x0], x2 st1 {v6.8b}, [x0], x2 st1 {v7.8b}, [x0], x2 ret endfunc function add16x16_idct8_neon, export=1 mov x7, x30 bl X(add8x8_idct8_neon) sub x0, x0, #8*FDEC_STRIDE-8 bl X(add8x8_idct8_neon) sub x0, x0, #8 bl X(add8x8_idct8_neon) sub x0, x0, #8*FDEC_STRIDE-8 mov x30, x7 b X(add8x8_idct8_neon) endfunc function add8x8_idct_dc_neon, export=1 mov x2, #FDEC_STRIDE ld1 {v16.4h}, [x1] ld1 {v0.8b}, [x0], x2 srshr v16.4h, v16.4h, #6 ld1 {v1.8b}, [x0], x2 dup v20.8h, v16.h[0] dup v21.8h, v16.h[1] ld1 {v2.8b}, [x0], x2 dup v22.8h, v16.h[2] dup v23.8h, v16.h[3] ld1 {v3.8b}, [x0], x2 trn1 v20.2d, v20.2d, v21.2d ld1 {v4.8b}, [x0], x2 trn1 v21.2d, v22.2d, v23.2d ld1 {v5.8b}, [x0], x2 neg v22.8h, v20.8h ld1 {v6.8b}, [x0], x2 neg v23.8h, v21.8h ld1 {v7.8b}, [x0], x2 sub x0, x0, #8*FDEC_STRIDE sqxtun v20.8b, v20.8h sqxtun v21.8b, v21.8h sqxtun v22.8b, v22.8h sqxtun v23.8b, v23.8h uqadd v0.8b, v0.8b, v20.8b uqadd v1.8b, v1.8b, v20.8b uqadd v2.8b, v2.8b, v20.8b uqadd v3.8b, v3.8b, v20.8b uqadd v4.8b, v4.8b, v21.8b uqadd v5.8b, v5.8b, v21.8b uqadd v6.8b, v6.8b, v21.8b uqadd v7.8b, v7.8b, v21.8b uqsub v0.8b, v0.8b, v22.8b uqsub v1.8b, v1.8b, v22.8b uqsub v2.8b, v2.8b, v22.8b uqsub v3.8b, v3.8b, v22.8b uqsub v4.8b, v4.8b, v23.8b uqsub v5.8b, v5.8b, v23.8b uqsub v6.8b, v6.8b, v23.8b uqsub v7.8b, v7.8b, v23.8b st1 {v0.8b}, [x0], x2 st1 {v1.8b}, [x0], x2 st1 {v2.8b}, [x0], x2 st1 {v3.8b}, [x0], x2 st1 {v4.8b}, [x0], x2 st1 {v5.8b}, [x0], x2 st1 {v6.8b}, [x0], x2 st1 {v7.8b}, [x0], x2 ret endfunc .macro ADD16x4_IDCT_DC dc ld1 {v4.16b}, [x0], x3 dup v24.8h, \dc[0] dup v25.8h, \dc[1] ld1 {v5.16b}, [x0], x3 dup v26.8h, \dc[2] dup v27.8h, \dc[3] ld1 {v6.16b}, [x0], x3 trn1 v24.2d, v24.2d, v25.2d ld1 {v7.16b}, [x0], x3 trn1 v25.2d, v26.2d, v27.2d neg v26.8h, v24.8h neg v27.8h, v25.8h sqxtun v20.8b, v24.8h sqxtun v21.8b, v26.8h sqxtun2 v20.16b, v25.8h sqxtun2 v21.16b, v27.8h uqadd v4.16b, v4.16b, v20.16b uqadd v5.16b, v5.16b, v20.16b uqadd v6.16b, v6.16b, v20.16b uqadd v7.16b, v7.16b, v20.16b uqsub v4.16b, v4.16b, v21.16b uqsub v5.16b, v5.16b, v21.16b uqsub v6.16b, v6.16b, v21.16b st1 {v4.16b}, [x2], x3 uqsub v7.16b, v7.16b, v21.16b st1 {v5.16b}, [x2], x3 st1 {v6.16b}, [x2], x3 st1 {v7.16b}, [x2], x3 .endm function add16x16_idct_dc_neon, export=1 mov x2, x0 mov x3, #FDEC_STRIDE ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] srshr v0.4h, v0.4h, #6 srshr v1.4h, v1.4h, #6 ADD16x4_IDCT_DC v0.h srshr v2.4h, v2.4h, #6 ADD16x4_IDCT_DC v1.h srshr v3.4h, v3.4h, #6 ADD16x4_IDCT_DC v2.h ADD16x4_IDCT_DC v3.h ret endfunc .macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7 ld1 {\t0\().8b}, [x1], x3 ld1 {\t1\().8b}, [x2], x4 ld1 {\t2\().8b}, [x1], x3 ld1 {\t3\().8b}, [x2], x4 usubl \t0\().8h, \t0\().8b, \t1\().8b ld1 {\t4\().8b}, [x1], x3 ld1 {\t5\().8b}, [x2], x4 usubl \t1\().8h, \t2\().8b, \t3\().8b ld1 {\t6\().8b}, [x1], x3 ld1 {\t7\().8b}, [x2], x4 add \dst\().8h, \t0\().8h, \t1\().8h usubl \t2\().8h, \t4\().8b, \t5\().8b usubl \t3\().8h, \t6\().8b, \t7\().8b add \dst\().8h, \dst\().8h, \t2\().8h add \dst\().8h, \dst\().8h, \t3\().8h .endm function sub8x8_dct_dc_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE sub4x4x2_dct_dc v0, v16, v17, v18, v19, 
v20, v21, v22, v23 sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 transpose v2.2d, v3.2d, v0.2d, v1.2d SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d addp v0.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v0.8h st1 {v0.4h}, [x0] ret endfunc function sub8x16_dct_dc_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 addp v4.8h, v0.8h, v2.8h addp v5.8h, v1.8h, v3.8h transpose v2.4s, v3.4s, v4.4s, v5.4s SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.4s, v3.4s, v0.4s, v1.4s SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h trn1 v2.2d, v0.2d, v1.2d trn2 v3.2d, v1.2d, v0.2d addp v0.8h, v2.8h, v3.8h st1 {v0.8h}, [x0] ret endfunc function zigzag_interleave_8x8_cavlc_neon, export=1 mov x3, #7 movi v31.4s, #1 ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 umax v16.8h, v0.8h, v4.8h umax v17.8h, v1.8h, v5.8h umax v18.8h, v2.8h, v6.8h umax v19.8h, v3.8h, v7.8h st1 {v0.8h}, [x0], #16 st1 {v4.8h}, [x0], #16 umaxp v16.8h, v16.8h, v17.8h umaxp v18.8h, v18.8h, v19.8h st1 {v1.8h}, [x0], #16 st1 {v5.8h}, [x0], #16 umaxp v16.8h, v16.8h, v18.8h st1 {v2.8h}, [x0], #16 st1 {v6.8h}, [x0], #16 cmhs v16.4s, v16.4s, v31.4s st1 {v3.8h}, [x0], #16 and v16.16b, v16.16b, v31.16b st1 {v7.8h}, [x0], #16 st1 {v16.b}[0], [x2], #1 st1 {v16.b}[4], [x2], x3 st1 {v16.b}[8], [x2], #1 st1 {v16.b}[12], [x2] ret endfunc function zigzag_scan_4x4_frame_neon, export=1 movrel x2, scan4x4_frame ld1 {v0.16b,v1.16b}, [x1] ld1 {v16.16b,v17.16b}, [x2] tbl v2.16b, {v0.16b,v1.16b}, v16.16b tbl v3.16b, {v0.16b,v1.16b}, v17.16b st1 {v2.16b,v3.16b}, [x0] ret endfunc .macro zigzag_sub_4x4 f ac function zigzag_sub_4x4\ac\()_\f\()_neon, export=1 mov x9, #FENC_STRIDE mov x4, #FDEC_STRIDE movrel x5, sub4x4_\f mov x6, x2 ld1 {v0.s}[0], [x1], x9 ld1 {v0.s}[1], [x1], x9 ld1 {v0.s}[2], [x1], x9 ld1 {v0.s}[3], [x1], x9 ld1 {v16.16b}, [x5] ld1 {v1.s}[0], [x2], x4 ld1 {v1.s}[1], [x2], x4 ld1 {v1.s}[2], [x2], x4 ld1 {v1.s}[3], [x2], x4 tbl v2.16b, {v0.16b}, v16.16b tbl v3.16b, {v1.16b}, v16.16b st1 {v0.s}[0], [x6], x4 usubl v4.8h, v2.8b, v3.8b .ifc \ac, ac dup h7, v4.h[0] ins v4.h[0], wzr fmov w5, s7 strh w5, [x3] .endif usubl2 v5.8h, v2.16b, v3.16b st1 {v0.s}[1], [x6], x4 umax v6.8h, v4.8h, v5.8h umaxv h6, v6.8h st1 {v0.s}[2], [x6], x4 fmov w7, s6 st1 {v0.s}[3], [x6], x4 cmp w7, #0 st1 {v4.8h,v5.8h}, [x0] cset w0, ne ret endfunc .endm zigzag_sub_4x4 field zigzag_sub_4x4 field, ac zigzag_sub_4x4 frame zigzag_sub_4x4 frame, ac function zigzag_scan_4x4_field_neon, export=1 movrel x2, scan4x4_field ld1 {v0.8h,v1.8h}, [x1] ld1 {v16.16b}, [x2] tbl v0.16b, {v0.16b}, v16.16b st1 {v0.8h,v1.8h}, [x0] ret endfunc function zigzag_scan_8x8_frame_neon, export=1 movrel x2, scan8x8_frame ld1 {v0.8h,v1.8h}, [x1], #32 ld1 {v2.8h,v3.8h}, [x1], #32 ld1 {v4.8h,v5.8h}, [x1], #32 ld1 {v6.8h,v7.8h}, [x1] ld1 {v16.16b,v17.16b}, [x2], #32 ld1 {v18.16b,v19.16b}, [x2], #32 ld1 {v20.16b,v21.16b}, [x2], #32 ld1 {v22.16b,v23.16b}, [x2], #32 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b tbl 
v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b mov v25.h[6], v4.h[0] mov v25.h[7], v5.h[0] mov v26.h[0], v4.h[1] mov v27.h[4], v7.h[0] mov v28.h[7], v4.h[4] mov v29.h[7], v3.h[6] mov v30.h[0], v2.h[7] mov v30.h[1], v3.h[7] st1 {v24.8h,v25.8h}, [x0], #32 st1 {v26.8h,v27.8h}, [x0], #32 st1 {v28.8h,v29.8h}, [x0], #32 st1 {v30.8h,v31.8h}, [x0] ret endfunc #define Z(z) 2*(z), 2*(z)+1 #define T(x,y) Z(x*8+y) const scan8x8_frame, align=5 .byte T(0,0), T(1,0), T(0,1), T(0,2) .byte T(1,1), T(2,0), T(3,0), T(2,1) .byte T(1,2), T(0,3), T(0,4), T(1,3) .byte T(2,2), T(3,1), T(4,0), T(5,0) .byte T(4,1), T(3,2), T(2,3), T(1,4) .byte T(0,5), T(0,6), T(1,5), T(2,4) #undef T #define T(x,y) Z((x-3)*8+y) .byte T(3,3), T(4,2), T(5,1), T(6,0) .byte T(7,0), T(6,1), T(5,2), T(4,3) #undef T #define T(x,y) Z((x-0)*8+y) .byte T(3,4), T(2,5), T(1,6), T(0,7) .byte T(1,7), T(2,6), T(3,5), T(4,4) #undef T #define T(x,y) Z((x-4)*8+y) .byte T(5,3), T(6,2), T(7,1), T(7,2) .byte T(6,3), T(5,4), T(4,5), T(3,6) .byte T(2,7), T(3,7), T(4,6), T(5,5) .byte T(6,4), T(7,3), T(7,4), T(6,5) .byte T(5,6), T(4,7), T(5,7), T(6,6) .byte T(7,5), T(7,6), T(6,7), T(7,7) endconst function zigzag_scan_8x8_field_neon, export=1 movrel x2, scan8x8_field ld1 {v0.8h,v1.8h}, [x1], #32 ld1 {v2.8h,v3.8h}, [x1], #32 ld1 {v4.8h,v5.8h}, [x1], #32 ld1 {v6.8h,v7.8h}, [x1] ld1 {v16.16b,v17.16b}, [x2], #32 ld1 {v18.16b,v19.16b}, [x2], #32 ld1 {v20.16b,v21.16b}, [x2], #32 ld1 {v22.16b}, [x2] ext v31.16b, v7.16b, v7.16b, #4 tbl v24.16b, {v0.16b,v1.16b}, v16.16b tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b ext v31.16b, v6.16b, v31.16b, #12 st1 {v24.8h,v25.8h}, [x0], #32 st1 {v26.8h,v27.8h}, [x0], #32 st1 {v28.8h,v29.8h}, [x0], #32 st1 {v30.8h,v31.8h}, [x0] ret endfunc .macro zigzag_sub8x8 f function zigzag_sub_8x8_\f\()_neon, export=1 movrel x4, sub8x8_\f mov x5, #FENC_STRIDE mov x6, #FDEC_STRIDE mov x7, x2 ld1 {v0.d}[0], [x1], x5 ld1 {v0.d}[1], [x1], x5 ld1 {v1.d}[0], [x1], x5 ld1 {v1.d}[1], [x1], x5 ld1 {v2.d}[0], [x1], x5 ld1 {v2.d}[1], [x1], x5 ld1 {v3.d}[0], [x1], x5 ld1 {v3.d}[1], [x1] ld1 {v4.d}[0], [x2], x6 ld1 {v4.d}[1], [x2], x6 ld1 {v5.d}[0], [x2], x6 ld1 {v5.d}[1], [x2], x6 ld1 {v6.d}[0], [x2], x6 ld1 {v6.d}[1], [x2], x6 ld1 {v7.d}[0], [x2], x6 ld1 {v7.d}[1], [x2] ld1 {v16.16b,v17.16b}, [x4], #32 ld1 {v18.16b,v19.16b}, [x4], #32 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b usubl v4.8h, v24.8b, v28.8b usubl2 v5.8h, v24.16b, v28.16b usubl v6.8h, v25.8b, v29.8b usubl2 v7.8h, v25.16b, v29.16b usubl v16.8h, v26.8b, v30.8b usubl2 v17.8h, v26.16b, v30.16b usubl v18.8h, v27.8b, v31.8b usubl2 v19.8h, v27.16b, v31.16b umax v20.8h, v4.8h, v5.8h umax v21.8h, v6.8h, v7.8h umax v22.8h, v16.8h, v17.8h umax v23.8h, v18.8h, v19.8h umax v20.8h, v20.8h, v21.8h umax v21.8h, v22.8h, v23.8h umax v20.8h, 
v20.8h, v21.8h umaxv h22, v20.8h st1 {v0.d}[0], [x7], x6 st1 {v0.d}[1], [x7], x6 st1 {v1.d}[0], [x7], x6 st1 {v1.d}[1], [x7], x6 st1 {v2.d}[0], [x7], x6 st1 {v2.d}[1], [x7], x6 st1 {v3.d}[0], [x7], x6 st1 {v3.d}[1], [x7] st1 {v4.8h,v5.8h}, [x0], #32 st1 {v6.8h,v7.8h}, [x0], #32 st1 {v16.8h,v17.8h}, [x0], #32 st1 {v18.8h,v19.8h}, [x0] fmov w9, s22 cmp w9, #0 cset w0, ne ret endfunc .endm zigzag_sub8x8 field zigzag_sub8x8 frame #undef T #define T(x,y) Z(x*8+y) const scan8x8_field, align=5 .byte T(0,0), T(0,1), T(0,2), T(1,0) .byte T(1,1), T(0,3), T(0,4), T(1,2) .byte T(2,0), T(1,3), T(0,5), T(0,6) .byte T(0,7), T(1,4), T(2,1), T(3,0) #undef T #define T(x,y) Z((x-1)*8+y) .byte T(2,2), T(1,5), T(1,6), T(1,7) .byte T(2,3), T(3,1), T(4,0), T(3,2) #undef T #define T(x,y) Z((x-2)*8+y) .byte T(2,4), T(2,5), T(2,6), T(2,7) .byte T(3,3), T(4,1), T(5,0), T(4,2) #undef T #define T(x,y) Z((x-3)*8+y) .byte T(3,4), T(3,5), T(3,6), T(3,7) .byte T(4,3), T(5,1), T(6,0), T(5,2) #undef T #define T(x,y) Z((x-4)*8+y) .byte T(4,4), T(4,5), T(4,6), T(4,7) .byte T(5,3), T(6,1), T(6,2), T(5,4) #undef T #define T(x,y) Z((x-5)*8+y) .byte T(5,5), T(5,6), T(5,7), T(6,3) .byte T(7,0), T(7,1), T(6,4), T(6,5) endconst #undef T #define T(y,x) x*8+y const sub8x8_frame, align=5 .byte T(0,0), T(1,0), T(0,1), T(0,2) .byte T(1,1), T(2,0), T(3,0), T(2,1) .byte T(1,2), T(0,3), T(0,4), T(1,3) .byte T(2,2), T(3,1), T(4,0), T(5,0) .byte T(4,1), T(3,2), T(2,3), T(1,4) .byte T(0,5), T(0,6), T(1,5), T(2,4) .byte T(3,3), T(4,2), T(5,1), T(6,0) .byte T(7,0), T(6,1), T(5,2), T(4,3) .byte T(3,4), T(2,5), T(1,6), T(0,7) .byte T(1,7), T(2,6), T(3,5), T(4,4) .byte T(5,3), T(6,2), T(7,1), T(7,2) .byte T(6,3), T(5,4), T(4,5), T(3,6) .byte T(2,7), T(3,7), T(4,6), T(5,5) .byte T(6,4), T(7,3), T(7,4), T(6,5) .byte T(5,6), T(4,7), T(5,7), T(6,6) .byte T(7,5), T(7,6), T(6,7), T(7,7) endconst const sub8x8_field, align=5 .byte T(0,0), T(0,1), T(0,2), T(1,0) .byte T(1,1), T(0,3), T(0,4), T(1,2) .byte T(2,0), T(1,3), T(0,5), T(0,6) .byte T(0,7), T(1,4), T(2,1), T(3,0) .byte T(2,2), T(1,5), T(1,6), T(1,7) .byte T(2,3), T(3,1), T(4,0), T(3,2) .byte T(2,4), T(2,5), T(2,6), T(2,7) .byte T(3,3), T(4,1), T(5,0), T(4,2) .byte T(3,4), T(3,5), T(3,6), T(3,7) .byte T(4,3), T(5,1), T(6,0), T(5,2) .byte T(4,4), T(4,5), T(4,6), T(4,7) .byte T(5,3), T(6,1), T(6,2), T(5,4) .byte T(5,5), T(5,6), T(5,7), T(6,3) .byte T(7,0), T(7,1), T(6,4), T(6,5) .byte T(6,6), T(6,7), T(7,2), T(7,3) .byte T(7,4), T(7,5), T(7,6), T(7,7) endconst x264-master/common/aarch64/dct.h000066400000000000000000000132271502133446700165270ustar00rootroot00000000000000/***************************************************************************** * dct.h: aarch64 transform and zigzag ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_AARCH64_DCT_H #define X264_AARCH64_DCT_H #define x264_dct4x4dc_neon x264_template(dct4x4dc_neon) void x264_dct4x4dc_neon( int16_t d[16] ); #define x264_idct4x4dc_neon x264_template(idct4x4dc_neon) void x264_idct4x4dc_neon( int16_t d[16] ); #define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon) void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon) void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon) void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_add4x4_idct_neon x264_template(add4x4_idct_neon) void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] ); #define x264_add8x8_idct_neon x264_template(add8x8_idct_neon) void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] ); #define x264_add16x16_idct_neon x264_template(add16x16_idct_neon) void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] ); #define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon) void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); #define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon) void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); #define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon) void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon) void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon) void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon) void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); #define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon) void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] ); #define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon) void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); #define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon) void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon) void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon) void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon) void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon) int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); #define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon) int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel 
*p_src, pixel *p_dst, dctcoef *dc ); #define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon) int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); #define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon) int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); #define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon) int x264_zigzag_sub_8x8_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); #define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon) int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); #define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon) void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve) void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ); #define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2) void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] ); #define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve) void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif x264-master/common/aarch64/deblock-a-common.S000066400000000000000000000033031502133446700210310ustar00rootroot00000000000000/***************************************************************************** * deblock-a-common.S: aarch64 deblocking ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Mans Rullgard * Janne Grunau * David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
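 *
 * Note: h264_loop_filter_start below is the shared prologue for the
 * normal-strength filters: w2/w3 hold alpha/beta, x4 points at the four
 * tc0 values, which are loaded into v24 for the later clipping, and the
 * caller returns early when there is nothing to filter (e.g. all four tc0
 * entries negative).  The per-edge macros that build on it (in deblock-a.S
 * and deblock-a-sve.S) compute the standard H.264 delta, roughly:
 *
 *   delta = clip3( -tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3 );
 *   p0'   = clip_pixel( p0 + delta );
 *   q0'   = clip_pixel( q0 - delta );
 *
 * with tc derived from the tc0 array (clip3/clip_pixel are illustrative
 * names, not functions defined here).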
*****************************************************************************/ // This file contains the NEON macros that are intended to be used by // the SVE/SVE2 functions as well .macro h264_loop_filter_start cmp w2, #0 ldr w6, [x4] ccmp w3, #0, #0, ne mov v24.s[0], w6 and w8, w6, w6, lsl #16 b.eq 1f ands w8, w8, w8, lsl #8 b.ge 2f 1: ret 2: .endm x264-master/common/aarch64/deblock-a-sve.S000066400000000000000000000071531502133446700203450ustar00rootroot00000000000000/***************************************************************************** * deblock-a-sve.S: aarch64 deblocking ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" #include "deblock-a-common.S" ENABLE_SVE .macro h264_loop_filter_chroma_sve ptrue p0.b, vl16 dup v22.16b, w2 // alpha uxtl v24.8h, v24.8b uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0) uxtl v4.8h, v0.8b uxtl2 v5.8h, v0.16b uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) usubw v4.8h, v4.8h, v16.8b usubw2 v5.8h, v5.8h, v16.16b sli v24.8h, v24.8h, #8 shl v4.8h, v4.8h, #2 shl v5.8h, v5.8h, #2 uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) uxtl v24.4s, v24.4h uaddw v4.8h, v4.8h, v18.8b uaddw2 v5.8h, v5.8h, v18.16b cmphi p1.b, p0/z, z22.b, z26.b usubw v4.8h, v4.8h, v2.8b usubw2 v5.8h, v5.8h, v2.16b sli v24.4s, v24.4s, #16 dup v22.16b, w3 // beta rshrn v4.8b, v4.8h, #3 rshrn2 v4.16b, v5.8h, #3 cmphi p2.b, p0/z, z22.b, z28.b cmphi p3.b, p0/z, z22.b, z30.b smin v4.16b, v4.16b, v24.16b neg v25.16b, v24.16b and p1.b, p0/z, p1.b, p2.b smax v4.16b, v4.16b, v25.16b and p1.b, p0/z, p1.b, p3.b uxtl v22.8h, v0.8b uxtl2 v23.8h, v0.16b uxtl v28.8h, v16.8b uxtl2 v29.8h, v16.16b saddw v28.8h, v28.8h, v4.8b saddw2 v29.8h, v29.8h, v4.16b ssubw v22.8h, v22.8h, v4.8b ssubw2 v23.8h, v23.8h, v4.16b sqxtun v16.8b, v28.8h sqxtun v0.8b, v22.8h sqxtun2 v16.16b, v29.8h sqxtun2 v0.16b, v23.8h .endm function deblock_v_chroma_sve, export=1 h264_loop_filter_start sub x0, x0, x1, lsl #1 // No performance improvement if sve load is used. 
So, continue using // NEON load here ld1 {v18.16b}, [x0], x1 ld1 {v16.16b}, [x0], x1 ld1 {v0.16b}, [x0], x1 ld1 {v2.16b}, [x0] h264_loop_filter_chroma_sve sub x0, x0, x1, lsl #1 st1b {z16.b}, p1, [x0] add x0, x0, x1 st1b {z0.b}, p1, [x0] ret endfunc x264-master/common/aarch64/deblock-a.S000066400000000000000000000707531502133446700175600ustar00rootroot00000000000000/***************************************************************************** * deblock.S: aarch64 deblocking ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Mans Rullgard * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" #include "deblock-a-common.S" .macro h264_loop_filter_luma dup v22.16b, w2 // alpha uxtl v24.8h, v24.8b uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0) uxtl v24.4s, v24.4h uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) sli v24.8h, v24.8h, #8 uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) sli v24.4s, v24.4s, #16 cmhi v21.16b, v22.16b, v21.16b // < alpha dup v22.16b, w3 // beta cmlt v23.16b, v24.16b, #0 cmhi v28.16b, v22.16b, v28.16b // < beta cmhi v30.16b, v22.16b, v30.16b // < beta bic v21.16b, v21.16b, v23.16b uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0) and v21.16b, v21.16b, v28.16b uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0) cmhi v17.16b, v22.16b, v17.16b // < beta and v21.16b, v21.16b, v30.16b cmhi v19.16b, v22.16b, v19.16b // < beta and v17.16b, v17.16b, v21.16b and v19.16b, v19.16b, v21.16b and v24.16b, v24.16b, v21.16b urhadd v28.16b, v16.16b, v0.16b sub v21.16b, v24.16b, v17.16b uqadd v23.16b, v18.16b, v24.16b uhadd v20.16b, v20.16b, v28.16b sub v21.16b, v21.16b, v19.16b uhadd v28.16b, v4.16b, v28.16b umin v23.16b, v23.16b, v20.16b uqsub v22.16b, v18.16b, v24.16b uqadd v4.16b, v2.16b, v24.16b umax v23.16b, v23.16b, v22.16b uqsub v22.16b, v2.16b, v24.16b umin v28.16b, v4.16b, v28.16b uxtl v4.8h, v0.8b umax v28.16b, v28.16b, v22.16b uxtl2 v20.8h, v0.16b usubw v4.8h, v4.8h, v16.8b usubw2 v20.8h, v20.8h, v16.16b shl v4.8h, v4.8h, #2 shl v20.8h, v20.8h, #2 uaddw v4.8h, v4.8h, v18.8b uaddw2 v20.8h, v20.8h, v18.16b usubw v4.8h, v4.8h, v2.8b usubw2 v20.8h, v20.8h, v2.16b rshrn v4.8b, v4.8h, #3 rshrn2 v4.16b, v20.8h, #3 bsl v17.16b, v23.16b, v18.16b bsl v19.16b, v28.16b, v2.16b neg v23.16b, v21.16b uxtl v28.8h, v16.8b smin v4.16b, v4.16b, v21.16b uxtl2 v21.8h, v16.16b smax v4.16b, v4.16b, v23.16b uxtl v22.8h, v0.8b uxtl2 v24.8h, v0.16b saddw v28.8h, v28.8h, v4.8b saddw2 v21.8h, v21.8h, v4.16b ssubw v22.8h, v22.8h, v4.8b ssubw2 v24.8h, v24.8h, v4.16b sqxtun v16.8b, v28.8h sqxtun2 v16.16b, v21.8h sqxtun v0.8b, v22.8h sqxtun2 v0.16b, v24.8h .endm function 
deblock_v_luma_neon, export=1 h264_loop_filter_start ld1 {v0.16b}, [x0], x1 ld1 {v2.16b}, [x0], x1 ld1 {v4.16b}, [x0], x1 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 ld1 {v20.16b}, [x0], x1 ld1 {v18.16b}, [x0], x1 ld1 {v16.16b}, [x0], x1 h264_loop_filter_luma sub x0, x0, x1, lsl #1 st1 {v17.16b}, [x0], x1 st1 {v16.16b}, [x0], x1 st1 {v0.16b}, [x0], x1 st1 {v19.16b}, [x0] ret endfunc function deblock_h_luma_neon, export=1 h264_loop_filter_start sub x0, x0, #4 ld1 {v6.8b}, [x0], x1 ld1 {v20.8b}, [x0], x1 ld1 {v18.8b}, [x0], x1 ld1 {v16.8b}, [x0], x1 ld1 {v0.8b}, [x0], x1 ld1 {v2.8b}, [x0], x1 ld1 {v4.8b}, [x0], x1 ld1 {v26.8b}, [x0], x1 ld1 {v6.d}[1], [x0], x1 ld1 {v20.d}[1], [x0], x1 ld1 {v18.d}[1], [x0], x1 ld1 {v16.d}[1], [x0], x1 ld1 {v0.d}[1], [x0], x1 ld1 {v2.d}[1], [x0], x1 ld1 {v4.d}[1], [x0], x1 ld1 {v26.d}[1], [x0], x1 transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23 h264_loop_filter_luma transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27 sub x0, x0, x1, lsl #4 add x0, x0, #2 st1 {v17.s}[0], [x0], x1 st1 {v16.s}[0], [x0], x1 st1 {v0.s}[0], [x0], x1 st1 {v19.s}[0], [x0], x1 st1 {v17.s}[1], [x0], x1 st1 {v16.s}[1], [x0], x1 st1 {v0.s}[1], [x0], x1 st1 {v19.s}[1], [x0], x1 st1 {v17.s}[2], [x0], x1 st1 {v16.s}[2], [x0], x1 st1 {v0.s}[2], [x0], x1 st1 {v19.s}[2], [x0], x1 st1 {v17.s}[3], [x0], x1 st1 {v16.s}[3], [x0], x1 st1 {v0.s}[3], [x0], x1 st1 {v19.s}[3], [x0], x1 ret endfunc .macro h264_loop_filter_start_intra orr w4, w2, w3 cmp w4, #0 b.ne 1f ret 1: dup v30.16b, w2 // alpha dup v31.16b, w3 // beta .endm .macro h264_loop_filter_luma_intra uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) cmhi v19.16b, v30.16b, v16.16b // < alpha cmhi v17.16b, v31.16b, v17.16b // < beta cmhi v18.16b, v31.16b, v18.16b // < beta movi v29.16b, #2 ushr v30.16b, v30.16b, #2 // alpha >> 2 add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 and v19.16b, v19.16b, v17.16b and v19.16b, v19.16b, v18.16b shrn v20.8b, v19.8h, #4 mov x4, v20.d[0] cbz x4, 9f ushll v20.8h, v6.8b, #1 ushll v22.8h, v1.8b, #1 ushll2 v21.8h, v6.16b, #1 ushll2 v23.8h, v1.16b, #1 uaddw v20.8h, v20.8h, v7.8b uaddw v22.8h, v22.8h, v0.8b uaddw2 v21.8h, v21.8h, v7.16b uaddw2 v23.8h, v23.8h, v0.16b uaddw v20.8h, v20.8h, v1.8b uaddw v22.8h, v22.8h, v6.8b uaddw2 v21.8h, v21.8h, v1.16b uaddw2 v23.8h, v23.8h, v6.16b rshrn v24.8b, v20.8h, #2 // p0'_1 rshrn v25.8b, v22.8h, #2 // q0'_1 rshrn2 v24.16b, v21.8h, #2 // p0'_1 rshrn2 v25.16b, v23.8h, #2 // q0'_1 uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) cmhi v17.16b, v31.16b, v17.16b // < beta cmhi v18.16b, v31.16b, v18.16b // < beta and v17.16b, v16.16b, v17.16b // if_2 && if_3 and v18.16b, v16.16b, v18.16b // if_2 && if_4 not v30.16b, v17.16b not v31.16b, v18.16b and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 uaddl v26.8h, v5.8b, v7.8b uaddl2 v27.8h, v5.16b, v7.16b uaddw v26.8h, v26.8h, v0.8b uaddw2 v27.8h, v27.8h, v0.16b add v20.8h, v20.8h, v26.8h add v21.8h, v21.8h, v27.8h uaddw v20.8h, v20.8h, v0.8b uaddw2 v21.8h, v21.8h, v0.16b rshrn v20.8b, v20.8h, #3 // p0'_2 rshrn2 v20.16b, v21.8h, #3 // p0'_2 uaddw v26.8h, v26.8h, v6.8b uaddw2 v27.8h, v27.8h, v6.16b rshrn v21.8b, v26.8h, #2 // 
p1'_2 rshrn2 v21.16b, v27.8h, #2 // p1'_2 uaddl v28.8h, v4.8b, v5.8b uaddl2 v29.8h, v4.16b, v5.16b shl v28.8h, v28.8h, #1 shl v29.8h, v29.8h, #1 add v28.8h, v28.8h, v26.8h add v29.8h, v29.8h, v27.8h rshrn v19.8b, v28.8h, #3 // p2'_2 rshrn2 v19.16b, v29.8h, #3 // p2'_2 //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 uaddl v26.8h, v2.8b, v0.8b uaddl2 v27.8h, v2.16b, v0.16b uaddw v26.8h, v26.8h, v7.8b uaddw2 v27.8h, v27.8h, v7.16b add v22.8h, v22.8h, v26.8h add v23.8h, v23.8h, v27.8h uaddw v22.8h, v22.8h, v7.8b uaddw2 v23.8h, v23.8h, v7.16b rshrn v22.8b, v22.8h, #3 // q0'_2 rshrn2 v22.16b, v23.8h, #3 // q0'_2 uaddw v26.8h, v26.8h, v1.8b uaddw2 v27.8h, v27.8h, v1.16b rshrn v23.8b, v26.8h, #2 // q1'_2 rshrn2 v23.16b, v27.8h, #2 // q1'_2 uaddl v28.8h, v2.8b, v3.8b uaddl2 v29.8h, v2.16b, v3.16b shl v28.8h, v28.8h, #1 shl v29.8h, v29.8h, #1 add v28.8h, v28.8h, v26.8h add v29.8h, v29.8h, v27.8h rshrn v26.8b, v28.8h, #3 // q2'_2 rshrn2 v26.16b, v29.8h, #3 // q2'_2 bit v7.16b, v24.16b, v30.16b // p0'_1 bit v0.16b, v25.16b, v31.16b // q0'_1 bit v7.16b, v20.16b, v17.16b // p0'_2 bit v6.16b, v21.16b, v17.16b // p1'_2 bit v5.16b, v19.16b, v17.16b // p2'_2 bit v0.16b, v22.16b, v18.16b // q0'_2 bit v1.16b, v23.16b, v18.16b // q1'_2 bit v2.16b, v26.16b, v18.16b // q2'_2 .endm function deblock_v_luma_intra_neon, export=1 h264_loop_filter_start_intra ld1 {v0.16b}, [x0], x1 // q0 ld1 {v1.16b}, [x0], x1 // q1 ld1 {v2.16b}, [x0], x1 // q2 ld1 {v3.16b}, [x0], x1 // q3 sub x0, x0, x1, lsl #3 ld1 {v4.16b}, [x0], x1 // p3 ld1 {v5.16b}, [x0], x1 // p2 ld1 {v6.16b}, [x0], x1 // p1 ld1 {v7.16b}, [x0] // p0 h264_loop_filter_luma_intra sub x0, x0, x1, lsl #1 st1 {v5.16b}, [x0], x1 // p2 st1 {v6.16b}, [x0], x1 // p1 st1 {v7.16b}, [x0], x1 // p0 st1 {v0.16b}, [x0], x1 // q0 st1 {v1.16b}, [x0], x1 // q1 st1 {v2.16b}, [x0] // q2 9: ret endfunc function deblock_h_luma_intra_neon, export=1 h264_loop_filter_start_intra sub x0, x0, #4 ld1 {v4.8b}, [x0], x1 ld1 {v5.8b}, [x0], x1 ld1 {v6.8b}, [x0], x1 ld1 {v7.8b}, [x0], x1 ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x0], x1 ld1 {v2.8b}, [x0], x1 ld1 {v3.8b}, [x0], x1 ld1 {v4.d}[1], [x0], x1 ld1 {v5.d}[1], [x0], x1 ld1 {v6.d}[1], [x0], x1 ld1 {v7.d}[1], [x0], x1 ld1 {v0.d}[1], [x0], x1 ld1 {v1.d}[1], [x0], x1 ld1 {v2.d}[1], [x0], x1 ld1 {v3.d}[1], [x0], x1 transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 h264_loop_filter_luma_intra transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 sub x0, x0, x1, lsl #4 st1 {v4.8b}, [x0], x1 st1 {v5.8b}, [x0], x1 st1 {v6.8b}, [x0], x1 st1 {v7.8b}, [x0], x1 st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x0], x1 st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x0], x1 st1 {v4.d}[1], [x0], x1 st1 {v5.d}[1], [x0], x1 st1 {v6.d}[1], [x0], x1 st1 {v7.d}[1], [x0], x1 st1 {v0.d}[1], [x0], x1 st1 {v1.d}[1], [x0], x1 st1 {v2.d}[1], [x0], x1 st1 {v3.d}[1], [x0], x1 9: ret endfunc .macro h264_loop_filter_chroma dup v22.16b, w2 // alpha uxtl v24.8h, v24.8b uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0) uxtl v4.8h, v0.8b uxtl2 v5.8h, v0.16b uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) usubw v4.8h, v4.8h, v16.8b usubw2 v5.8h, v5.8h, v16.16b sli v24.8h, v24.8h, #8 shl v4.8h, v4.8h, #2 shl v5.8h, v5.8h, #2 uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) uxtl v24.4s, v24.4h uaddw v4.8h, v4.8h, v18.8b uaddw2 v5.8h, v5.8h, v18.16b cmhi v26.16b, v22.16b, v26.16b // < alpha usubw v4.8h, v4.8h, v2.8b usubw2 v5.8h, v5.8h, v2.16b sli v24.4s, v24.4s, #16 dup v22.16b, w3 // beta rshrn v4.8b, v4.8h, #3 rshrn2 v4.16b, v5.8h, #3 cmhi v28.16b, v22.16b, v28.16b // < beta cmhi v30.16b, v22.16b, 
v30.16b // < beta smin v4.16b, v4.16b, v24.16b neg v25.16b, v24.16b and v26.16b, v26.16b, v28.16b smax v4.16b, v4.16b, v25.16b and v26.16b, v26.16b, v30.16b uxtl v22.8h, v0.8b uxtl2 v23.8h, v0.16b and v4.16b, v4.16b, v26.16b uxtl v28.8h, v16.8b uxtl2 v29.8h, v16.16b saddw v28.8h, v28.8h, v4.8b saddw2 v29.8h, v29.8h, v4.16b ssubw v22.8h, v22.8h, v4.8b ssubw2 v23.8h, v23.8h, v4.16b sqxtun v16.8b, v28.8h sqxtun v0.8b, v22.8h sqxtun2 v16.16b, v29.8h sqxtun2 v0.16b, v23.8h .endm function deblock_v_chroma_neon, export=1 h264_loop_filter_start sub x0, x0, x1, lsl #1 ld1 {v18.16b}, [x0], x1 ld1 {v16.16b}, [x0], x1 ld1 {v0.16b}, [x0], x1 ld1 {v2.16b}, [x0] h264_loop_filter_chroma sub x0, x0, x1, lsl #1 st1 {v16.16b}, [x0], x1 st1 {v0.16b}, [x0], x1 ret endfunc function deblock_h_chroma_neon, export=1 h264_loop_filter_start sub x0, x0, #4 deblock_h_chroma: ld1 {v18.d}[0], [x0], x1 ld1 {v16.d}[0], [x0], x1 ld1 {v0.d}[0], [x0], x1 ld1 {v2.d}[0], [x0], x1 ld1 {v18.d}[1], [x0], x1 ld1 {v16.d}[1], [x0], x1 ld1 {v0.d}[1], [x0], x1 ld1 {v2.d}[1], [x0], x1 transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31 h264_loop_filter_chroma transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31 sub x0, x0, x1, lsl #3 st1 {v18.d}[0], [x0], x1 st1 {v16.d}[0], [x0], x1 st1 {v0.d}[0], [x0], x1 st1 {v2.d}[0], [x0], x1 st1 {v18.d}[1], [x0], x1 st1 {v16.d}[1], [x0], x1 st1 {v0.d}[1], [x0], x1 st1 {v2.d}[1], [x0], x1 ret endfunc function deblock_h_chroma_422_neon, export=1 add x5, x0, x1 sub x0, x0, #4 add x1, x1, x1 h264_loop_filter_start mov x7, x30 bl deblock_h_chroma mov x30, x7 sub x0, x5, #4 mov v24.s[0], w6 b deblock_h_chroma endfunc .macro h264_loop_filter_chroma8 dup v22.8b, w2 // alpha uxtl v24.8h, v24.8b uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0) uxtl v4.8h, v17.8b uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0) usubw v4.8h, v4.8h, v16.8b sli v24.8h, v24.8h, #8 shl v4.8h, v4.8h, #2 uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0) uaddw v4.8h, v4.8h, v18.8b cmhi v26.8b, v22.8b, v26.8b // < alpha usubw v4.8h, v4.8h, v19.8b dup v22.8b, w3 // beta rshrn v4.8b, v4.8h, #3 cmhi v28.8b, v22.8b, v28.8b // < beta cmhi v30.8b, v22.8b, v30.8b // < beta smin v4.8b, v4.8b, v24.8b neg v25.8b, v24.8b and v26.8b, v26.8b, v28.8b smax v4.8b, v4.8b, v25.8b and v26.8b, v26.8b, v30.8b uxtl v22.8h, v17.8b and v4.8b, v4.8b, v26.8b uxtl v28.8h, v16.8b saddw v28.8h, v28.8h, v4.8b ssubw v22.8h, v22.8h, v4.8b sqxtun v16.8b, v28.8h sqxtun v17.8b, v22.8h .endm function deblock_h_chroma_mbaff_neon, export=1 h264_loop_filter_start sub x4, x0, #4 sub x0, x0, #2 ld1 {v18.8b}, [x4], x1 ld1 {v16.8b}, [x4], x1 ld1 {v17.8b}, [x4], x1 ld1 {v19.8b}, [x4] transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31 h264_loop_filter_chroma8 st2 {v16.h,v17.h}[0], [x0], x1 st2 {v16.h,v17.h}[1], [x0], x1 st2 {v16.h,v17.h}[2], [x0], x1 st2 {v16.h,v17.h}[3], [x0] ret endfunc .macro h264_loop_filter_chroma_intra width=16 uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0) uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0) uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0) cmhi v26.16b, v30.16b, v26.16b // < alpha cmhi v27.16b, v31.16b, v27.16b // < beta cmhi v28.16b, v31.16b, v28.16b // < beta and v26.16b, v26.16b, v27.16b and v26.16b, v26.16b, v28.16b ushll v4.8h, v18.8b, #1 ushll v6.8h, v19.8b, #1 .ifc \width, 16 ushll2 v5.8h, v18.16b, #1 ushll2 v7.8h, v19.16b, #1 uaddl2 v21.8h, v16.16b, v19.16b uaddl2 v23.8h, v17.16b, v18.16b .endif uaddl v20.8h, v16.8b, v19.8b uaddl v22.8h, v17.8b, v18.8b add v20.8h, v20.8h, v4.8h // mlal? 
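// Note (descriptive comment, derived from the loads in deblock_v_chroma_intra_neon:
// v18=p1, v16=p0, v17=q0, v19=q1): v20 (and v21 in the 16-wide case) now hold
// 2*p1 + p0 + q1 per lane, and the matching adds below build 2*q1 + q0 + p1, so the
// uqrshrn #2 pair produces the intra chroma filter outputs
//   p0' = (2*p1 + p0 + q1 + 2) >> 2   and   q0' = (2*q1 + q0 + p1 + 2) >> 2,
// which are then merged into p0/q0 under the alpha/beta condition mask (v26).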
add v22.8h, v22.8h, v6.8h .ifc \width, 16 add v21.8h, v21.8h, v5.8h add v23.8h, v23.8h, v7.8h .endif uqrshrn v24.8b, v20.8h, #2 uqrshrn v25.8b, v22.8h, #2 .ifc \width, 16 uqrshrn2 v24.16b, v21.8h, #2 uqrshrn2 v25.16b, v23.8h, #2 .endif bit v16.16b, v24.16b, v26.16b bit v17.16b, v25.16b, v26.16b .endm function deblock_v_chroma_intra_neon, export=1 h264_loop_filter_start_intra sub x0, x0, x1, lsl #1 ld1 {v18.16b}, [x0], x1 ld1 {v16.16b}, [x0], x1 ld1 {v17.16b}, [x0], x1 ld1 {v19.16b}, [x0] h264_loop_filter_chroma_intra sub x0, x0, x1, lsl #1 st1 {v16.16b}, [x0], x1 st1 {v17.16b}, [x0], x1 ret endfunc function deblock_h_chroma_intra_mbaff_neon, export=1 h264_loop_filter_start_intra sub x4, x0, #4 sub x0, x0, #2 ld1 {v18.8b}, [x4], x1 ld1 {v16.8b}, [x4], x1 ld1 {v17.8b}, [x4], x1 ld1 {v19.8b}, [x4], x1 transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29 h264_loop_filter_chroma_intra width=8 st2 {v16.h,v17.h}[0], [x0], x1 st2 {v16.h,v17.h}[1], [x0], x1 st2 {v16.h,v17.h}[2], [x0], x1 st2 {v16.h,v17.h}[3], [x0], x1 ret endfunc function deblock_h_chroma_intra_neon, export=1 h264_loop_filter_start_intra sub x4, x0, #4 sub x0, x0, #2 ld1 {v18.d}[0], [x4], x1 ld1 {v16.d}[0], [x4], x1 ld1 {v17.d}[0], [x4], x1 ld1 {v19.d}[0], [x4], x1 ld1 {v18.d}[1], [x4], x1 ld1 {v16.d}[1], [x4], x1 ld1 {v17.d}[1], [x4], x1 ld1 {v19.d}[1], [x4], x1 transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 h264_loop_filter_chroma_intra st2 {v16.h,v17.h}[0], [x0], x1 st2 {v16.h,v17.h}[1], [x0], x1 st2 {v16.h,v17.h}[2], [x0], x1 st2 {v16.h,v17.h}[3], [x0], x1 st2 {v16.h,v17.h}[4], [x0], x1 st2 {v16.h,v17.h}[5], [x0], x1 st2 {v16.h,v17.h}[6], [x0], x1 st2 {v16.h,v17.h}[7], [x0], x1 ret endfunc function deblock_h_chroma_422_intra_neon, export=1 h264_loop_filter_start_intra sub x4, x0, #4 sub x0, x0, #2 ld1 {v18.d}[0], [x4], x1 ld1 {v16.d}[0], [x4], x1 ld1 {v17.d}[0], [x4], x1 ld1 {v19.d}[0], [x4], x1 ld1 {v18.d}[1], [x4], x1 ld1 {v16.d}[1], [x4], x1 ld1 {v17.d}[1], [x4], x1 ld1 {v19.d}[1], [x4], x1 transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 h264_loop_filter_chroma_intra st2 {v16.h,v17.h}[0], [x0], x1 st2 {v16.h,v17.h}[1], [x0], x1 st2 {v16.h,v17.h}[2], [x0], x1 st2 {v16.h,v17.h}[3], [x0], x1 st2 {v16.h,v17.h}[4], [x0], x1 st2 {v16.h,v17.h}[5], [x0], x1 st2 {v16.h,v17.h}[6], [x0], x1 st2 {v16.h,v17.h}[7], [x0], x1 ld1 {v18.d}[0], [x4], x1 ld1 {v16.d}[0], [x4], x1 ld1 {v17.d}[0], [x4], x1 ld1 {v19.d}[0], [x4], x1 ld1 {v18.d}[1], [x4], x1 ld1 {v16.d}[1], [x4], x1 ld1 {v17.d}[1], [x4], x1 ld1 {v19.d}[1], [x4], x1 transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 h264_loop_filter_chroma_intra st2 {v16.h,v17.h}[0], [x0], x1 st2 {v16.h,v17.h}[1], [x0], x1 st2 {v16.h,v17.h}[2], [x0], x1 st2 {v16.h,v17.h}[3], [x0], x1 st2 {v16.h,v17.h}[4], [x0], x1 st2 {v16.h,v17.h}[5], [x0], x1 st2 {v16.h,v17.h}[6], [x0], x1 st2 {v16.h,v17.h}[7], [x0], x1 ret endfunc // void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE], // int8_t ref[2][X264_SCAN8_LUMA_SIZE], // int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], // uint8_t bs[2][8][4], int mvy_limit, // int bframe ) function deblock_strength_neon, export=1 movi v4.16b, #0 lsl w4, w4, #8 add x3, x3, #32 sub w4, w4, #(1<<8)-3 movi v5.16b, #0 dup v6.8h, w4 mov x6, #-32 bframe: // load bytes ref add x2, x2, #16 ld1 {v31.d}[1], [x1], #8 ld1 {v1.16b}, [x1], #16 movi v0.16b, #0 ld1 {v2.16b}, [x1], #16 ext v3.16b, v0.16b, v1.16b, #15 ext v0.16b, v0.16b, v2.16b, #15 unzip v21.4s, v22.4s, v1.4s, v2.4s unzip v23.4s, v20.4s, v3.4s, v0.4s ext v21.16b, v31.16b, v22.16b, #12 eor v0.16b, v20.16b, v22.16b eor 
v1.16b, v21.16b, v22.16b orr v4.16b, v4.16b, v0.16b orr v5.16b, v5.16b, v1.16b ld1 {v21.8h}, [x2], #16 // mv + 0x10 ld1 {v19.8h}, [x2], #16 // mv + 0x20 ld1 {v22.8h}, [x2], #16 // mv + 0x30 ld1 {v18.8h}, [x2], #16 // mv + 0x40 ld1 {v23.8h}, [x2], #16 // mv + 0x50 ext v19.16b, v19.16b, v22.16b, #12 ext v18.16b, v18.16b, v23.16b, #12 sabd v0.8h, v22.8h, v19.8h ld1 {v19.8h}, [x2], #16 // mv + 0x60 sabd v1.8h, v23.8h, v18.8h ld1 {v24.8h}, [x2], #16 // mv + 0x70 uqxtn v0.8b, v0.8h ld1 {v18.8h}, [x2], #16 // mv + 0x80 ld1 {v25.8h}, [x2], #16 // mv + 0x90 uqxtn2 v0.16b, v1.8h ext v19.16b, v19.16b, v24.16b, #12 ext v18.16b, v18.16b, v25.16b, #12 sabd v1.8h, v24.8h, v19.8h sabd v2.8h, v25.8h, v18.8h uqxtn v1.8b, v1.8h uqxtn2 v1.16b, v2.8h uqsub v0.16b, v0.16b, v6.16b uqsub v1.16b, v1.16b, v6.16b uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h sabd v1.8h, v22.8h, v23.8h orr v4.16b, v4.16b, v0.16b sabd v0.8h, v21.8h, v22.8h sabd v2.8h, v23.8h, v24.8h sabd v3.8h, v24.8h, v25.8h uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h uqxtn v1.8b, v2.8h uqxtn2 v1.16b, v3.8h uqsub v0.16b, v0.16b, v6.16b uqsub v1.16b, v1.16b, v6.16b uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h subs w5, w5, #1 orr v5.16b, v5.16b, v0.16b b.eq bframe movi v6.16b, #1 // load bytes nnz ld1 {v31.d}[1], [x0], #8 ld1 {v1.16b}, [x0], #16 movi v0.16b, #0 ld1 {v2.16b}, [x0], #16 ext v3.16b, v0.16b, v1.16b, #15 ext v0.16b, v0.16b, v2.16b, #15 unzip v21.4s, v22.4s, v1.4s, v2.4s unzip v23.4s, v20.4s, v3.4s, v0.4s ext v21.16b, v31.16b, v22.16b, #12 movrel x7, transpose_table ld1 {v7.16b}, [x7] orr v0.16b, v20.16b, v22.16b orr v1.16b, v21.16b, v22.16b umin v0.16b, v0.16b, v6.16b umin v1.16b, v1.16b, v6.16b umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0 umin v5.16b, v5.16b, v6.16b add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0 add v1.16b, v1.16b, v1.16b umax v4.16b, v4.16b, v0.16b umax v5.16b, v5.16b, v1.16b tbl v6.16b, {v4.16b}, v7.16b st1 {v5.16b}, [x3], x6 // bs[1] st1 {v6.16b}, [x3] // bs[0] ret endfunc const transpose_table .byte 0, 4, 8, 12 .byte 1, 5, 9, 13 .byte 2, 6, 10, 14 .byte 3, 7, 11, 15 endconst x264-master/common/aarch64/deblock.h000066400000000000000000000075211502133446700173600ustar00rootroot00000000000000/***************************************************************************** * deblock.h: aarch64 deblocking ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_AARCH64_DEBLOCK_H #define X264_AARCH64_DEBLOCK_H #define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon) void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon) void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon) void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon) void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_strength_neon x264_template(deblock_strength_neon) void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon) void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon) void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon) void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon) void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon) void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon) void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon) void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon) void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve) void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif x264-master/common/aarch64/mc-a-common.S000066400000000000000000000042371502133446700200340ustar00rootroot00000000000000/**************************************************************************** * mc-a-common.S: aarch64 motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * Mans Rullgard * Stefan Groenroos * David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ // This file contains the NEON macros and functions that are intended to be used by // the SVE/SVE2 functions as well #if BIT_DEPTH == 8 // 0 < weight < 64 .macro load_weights_add_add mov w6, w6 .endm // weight > 64 .macro load_weights_add_sub neg w7, w7 .endm // weight < 0 .macro load_weights_sub_add neg w6, w6 .endm function pixel_avg_w4_neon 1: subs w9, w9, #2 ld1 {v0.s}[0], [x2], x3 ld1 {v2.s}[0], [x4], x5 urhadd v0.8b, v0.8b, v2.8b ld1 {v1.s}[0], [x2], x3 ld1 {v3.s}[0], [x4], x5 urhadd v1.8b, v1.8b, v3.8b st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 1b ret endfunc #else // BIT_DEPTH == 10 #endif x264-master/common/aarch64/mc-a-sve.S000066400000000000000000000062331502133446700173370ustar00rootroot00000000000000/***************************************************************************** * mc-a-sve.S: aarch64 motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" #include "mc-a-common.S" ENABLE_SVE #if BIT_DEPTH == 8 // void pixel_avg( uint8_t *dst, intptr_t dst_stride, // uint8_t *src1, intptr_t src1_stride, // uint8_t *src2, intptr_t src2_stride, int weight ); .macro AVGH_SVE w h function pixel_avg_\w\()x\h\()_sve, export=1 mov w10, #64 cmp w6, #32 mov w9, #\h b.eq pixel_avg_w\w\()_neon subs w7, w10, w6 b.lt pixel_avg_weight_w\w\()_add_sub_sve // weight > 64 cmp w6, #0 b.ge pixel_avg_weight_w\w\()_add_add_sve b pixel_avg_weight_w\w\()_sub_add_sve // weight < 0 endfunc .endm AVGH_SVE 4, 2 AVGH_SVE 4, 4 AVGH_SVE 4, 8 AVGH_SVE 4, 16 // 0 < weight < 64 .macro weight_add_add_sve dst, s1, s2, h= mul \dst, \s1, v30.8h mla \dst, \s2, v31.8h .endm // weight > 64 .macro weight_add_sub_sve dst, s1, s2, h= mul \dst, \s1, v30.8h mls \dst, \s2, v31.8h .endm // weight < 0 .macro weight_sub_add_sve dst, s1, s2, h= mul \dst, \s2, v31.8h mls \dst, \s1, v30.8h .endm .macro AVG_WEIGHT_SVE ext function pixel_avg_weight_w4_\ext\()_sve load_weights_\ext ptrue p0.b, vl8 dup v30.8h, w6 dup v31.8h, w7 1: // height loop subs w9, w9, #2 ld1b {z0.h}, p0/z, [x2] add x2, x2, x3 ld1b {z1.h}, p0/z, [x4] add x4, x4, x5 weight_\ext\()_sve v4.8h, v0.8h, v1.8h ld1b {z2.h}, p0/z, [x2] add x2, x2, x3 ld1b {z3.h}, p0/z, [x4] add x4, x4, x5 sqrshrun v0.8b, v4.8h, #6 weight_\ext\()_sve v5.8h, v2.8h, v3.8h st1 {v0.s}[0], [x0], x1 sqrshrun v1.8b, v5.8h, #6 st1 {v1.s}[0], [x0], x1 b.gt 1b ret endfunc .endm AVG_WEIGHT_SVE add_add AVG_WEIGHT_SVE add_sub AVG_WEIGHT_SVE sub_add #else // BIT_DEPTH == 10 #endif x264-master/common/aarch64/mc-a.S000066400000000000000000003406271502133446700165540ustar00rootroot00000000000000/***************************************************************************** * mc.S: aarch64 motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * Mans Rullgard * Stefan Groenroos * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" #include "mc-a-common.S" // note: prefetch stuff assumes 64-byte cacheline // void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) function prefetch_ref_aarch64, export=1 cmp w2, #1 csel x2, xzr, x1, eq add x0, x0, #64 add x0, x0, x2, lsl #3 lsl x2, x1, #1 add x3, x1, x1, lsl #1 add x4, x0, x1, lsl #2 prfm pldl1strm, [x0] prfm pldl1strm, [x0, x1] prfm pldl1strm, [x0, x2] prfm pldl1strm, [x0, x3] prfm pldl1strm, [x4] prfm pldl1strm, [x4, x1] prfm pldl1strm, [x4, x2] prfm pldl1strm, [x4, x3] ret endfunc // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, // uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) .macro prefetch_fenc sub function prefetch_fenc_\sub\()_aarch64, export=1 and w6, w5, #3 and w7, w5, #3 mul x6, x6, x1 mul x7, x7, x3 add x0, x0, #64 add x2, x2, #64 add x0, x0, x6, lsl #2 add x6, x0, x1, lsl #1 prfm pldl1strm, [x0] prfm pldl1strm, [x0, x1] prfm pldl1strm, [x6] prfm pldl1strm, [x6, x1] add x2, x2, x7, lsl #1 prfm pldl1strm, [x2] prfm pldl1strm, [x2, x3] .ifc \sub, 422 add x7, x2, x3, lsl #1 prfm pldl1strm, [x7] prfm pldl1strm, [x7, x3] .endif ret endfunc .endm prefetch_fenc 420 prefetch_fenc 422 function mbtree_propagate_cost_neon, export=1 ld1r {v5.4s}, [x5] 8: subs w6, w6, #8 ld1 {v1.8h}, [x1], #16 ld1 {v2.8h}, [x2], #16 ld1 {v3.8h}, [x3], #16 ld1 {v4.8h}, [x4], #16 bic v3.8h, #0xc0, lsl #8 umin v3.8h, v2.8h, v3.8h umull v20.4s, v2.4h, v4.4h // propagate_intra umull2 v21.4s, v2.8h, v4.8h // propagate_intra usubl v22.4s, v2.4h, v3.4h // propagate_num usubl2 v23.4s, v2.8h, v3.8h // propagate_num uxtl v26.4s, v2.4h // propagate_denom uxtl2 v27.4s, v2.8h // propagate_denom uxtl v24.4s, v1.4h uxtl2 v25.4s, v1.8h ucvtf v20.4s, v20.4s ucvtf v21.4s, v21.4s ucvtf v26.4s, v26.4s ucvtf v27.4s, v27.4s ucvtf v22.4s, v22.4s ucvtf v23.4s, v23.4s frecpe v28.4s, v26.4s frecpe v29.4s, v27.4s ucvtf v24.4s, v24.4s ucvtf v25.4s, v25.4s frecps v30.4s, v28.4s, v26.4s frecps v31.4s, v29.4s, v27.4s fmla v24.4s, v20.4s, v5.4s // propagate_amount fmla v25.4s, v21.4s, v5.4s // propagate_amount fmul v28.4s, v28.4s, v30.4s fmul v29.4s, v29.4s, v31.4s fmul v16.4s, v24.4s, v22.4s fmul v17.4s, v25.4s, v23.4s fmul v18.4s, v16.4s, v28.4s fmul v19.4s, v17.4s, v29.4s fcvtns v20.4s, v18.4s fcvtns v21.4s, v19.4s sqxtn v0.4h, v20.4s sqxtn2 v0.8h, v21.4s st1 {v0.8h}, [x0], #16 b.gt 8b ret endfunc const pw_0to15, align=5 .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 endconst function mbtree_propagate_list_internal_neon, export=1 movrel x11, pw_0to15 dup v31.8h, w4 // bipred_weight movi v30.8h, #0xc0, lsl #8 ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y movi v28.4s, #4 movi v27.8h, #31 movi v26.8h, #32 dup v24.8h, w5 // mb_y zip1 v29.8h, v29.8h, v24.8h 8: subs w6, w6, #8 ld1 {v1.8h}, [x1], #16 // propagate_amount ld1 {v2.8h}, [x2], #16 // lowres_cost and v2.16b, v2.16b, v30.16b cmeq v25.8h, v2.8h, v30.8h umull v16.4s, v1.4h, v31.4h umull2 v17.4s, v1.8h, v31.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 ) // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 ld1 {v4.8h,v5.8h}, [x0], #32 sshr v6.8h, v4.8h, #5 sshr v7.8h, v5.8h, #5 add v6.8h, v6.8h, v29.8h add v29.8h, v29.8h, v28.8h add v7.8h, v7.8h, v29.8h add v29.8h, v29.8h, v28.8h st1 {v6.8h,v7.8h}, [x3], #32 and v4.16b, v4.16b, v27.16b and v5.16b, v5.16b, v27.16b uzp1 v6.8h, v4.8h, v5.8h // x & 31 uzp2 v7.8h, v4.8h, v5.8h // y & 31 sub v4.8h, v26.8h, v6.8h // 32 - (x 
& 31) sub v5.8h, v26.8h, v7.8h // 32 - (y & 31) mul v19.8h, v6.8h, v7.8h // idx3weight = y*x; mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x); mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x; mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ; umull v6.4s, v19.4h, v25.4h umull2 v7.4s, v19.8h, v25.8h umull v4.4s, v18.4h, v25.4h umull2 v5.4s, v18.8h, v25.8h umull v2.4s, v17.4h, v25.4h umull2 v3.4s, v17.8h, v25.8h umull v0.4s, v16.4h, v25.4h umull2 v1.4s, v16.8h, v25.8h rshrn v19.4h, v6.4s, #10 rshrn2 v19.8h, v7.4s, #10 rshrn v18.4h, v4.4s, #10 rshrn2 v18.8h, v5.4s, #10 rshrn v17.4h, v2.4s, #10 rshrn2 v17.8h, v3.4s, #10 rshrn v16.4h, v0.4s, #10 rshrn2 v16.8h, v1.4s, #10 zip1 v0.8h, v16.8h, v17.8h zip2 v1.8h, v16.8h, v17.8h zip1 v2.8h, v18.8h, v19.8h zip2 v3.8h, v18.8h, v19.8h st1 {v0.8h,v1.8h}, [x3], #32 st1 {v2.8h,v3.8h}, [x3], #32 b.ge 8b ret endfunc function memcpy_aligned_neon, export=1 tst x2, #16 b.eq 32f sub x2, x2, #16 ldr q0, [x1], #16 str q0, [x0], #16 32: tst x2, #32 b.eq 640f sub x2, x2, #32 ldp q0, q1, [x1], #32 stp q0, q1, [x0], #32 640: cbz x2, 1f 64: subs x2, x2, #64 ldp q0, q1, [x1, #32] ldp q2, q3, [x1], #64 stp q0, q1, [x0, #32] stp q2, q3, [x0], #64 b.gt 64b 1: ret endfunc function memzero_aligned_neon, export=1 movi v0.16b, #0 movi v1.16b, #0 1: subs x1, x1, #128 stp q0, q1, [x0, #96] stp q0, q1, [x0, #64] stp q0, q1, [x0, #32] stp q0, q1, [x0], 128 b.gt 1b ret endfunc // void mbtree_fix8_pack( int16_t *dst, float *src, int count ) function mbtree_fix8_pack_neon, export=1 subs w3, w2, #8 b.lt 2f 1: subs w3, w3, #8 ld1 {v0.4s,v1.4s}, [x1], #32 fcvtzs v0.4s, v0.4s, #8 fcvtzs v1.4s, v1.4s, #8 sqxtn v2.4h, v0.4s sqxtn2 v2.8h, v1.4s rev16 v3.16b, v2.16b st1 {v3.8h}, [x0], #16 b.ge 1b 2: adds w3, w3, #8 b.eq 4f 3: subs w3, w3, #1 ldr s0, [x1], #4 fcvtzs w4, s0, #8 rev16 w5, w4 strh w5, [x0], #2 b.gt 3b 4: ret endfunc // void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) function mbtree_fix8_unpack_neon, export=1 subs w3, w2, #8 b.lt 2f 1: subs w3, w3, #8 ld1 {v0.8h}, [x1], #16 rev16 v1.16b, v0.16b sxtl v2.4s, v1.4h sxtl2 v3.4s, v1.8h scvtf v4.4s, v2.4s, #8 scvtf v5.4s, v3.4s, #8 st1 {v4.4s,v5.4s}, [x0], #32 b.ge 1b 2: adds w3, w3, #8 b.eq 4f 3: subs w3, w3, #1 ldrh w4, [x1], #2 rev16 w5, w4 sxth w6, w5 scvtf s0, w6, #8 str s0, [x0], #4 b.gt 3b 4: ret endfunc #if BIT_DEPTH == 8 // void pixel_avg( uint8_t *dst, intptr_t dst_stride, // uint8_t *src1, intptr_t src1_stride, // uint8_t *src2, intptr_t src2_stride, int weight ); .macro AVGH w h function pixel_avg_\w\()x\h\()_neon, export=1 mov w10, #64 cmp w6, #32 mov w9, #\h b.eq pixel_avg_w\w\()_neon subs w7, w10, w6 b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 cmp w6, #0 b.ge pixel_avg_weight_w\w\()_add_add_neon b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 endfunc .endm AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 AVGH 16, 8 AVGH 16, 16 // 0 < weight < 64 .macro weight_add_add dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s1, v30.16b umlal2 \dst, \s2, v31.16b .else umull \dst, \s1, v30.8b umlal \dst, \s2, v31.8b .endif .endm // weight > 64 .macro weight_add_sub dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s1, v30.16b umlsl2 \dst, \s2, v31.16b .else umull \dst, \s1, v30.8b umlsl \dst, \s2, v31.8b .endif .endm // weight < 0 .macro weight_sub_add dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s2, v31.16b umlsl2 \dst, \s1, v30.16b .else umull \dst, \s2, v31.8b umlsl \dst, \s1, v30.8b .endif .endm .macro AVG_WEIGHT ext function pixel_avg_weight_w4_\ext\()_neon load_weights_\ext dup 
v30.8b, w6 dup v31.8b, w7 1: // height loop subs w9, w9, #2 ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x4], x5 weight_\ext v4.8h, v0.8b, v1.8b ld1 {v2.s}[0], [x2], x3 ld1 {v3.s}[0], [x4], x5 sqrshrun v0.8b, v4.8h, #6 weight_\ext v5.8h, v2.8b, v3.8b st1 {v0.s}[0], [x0], x1 sqrshrun v1.8b, v5.8h, #6 st1 {v1.s}[0], [x0], x1 b.gt 1b ret endfunc function pixel_avg_weight_w8_\ext\()_neon load_weights_\ext dup v30.8b, w6 dup v31.8b, w7 1: // height loop subs w9, w9, #4 ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x4], x5 weight_\ext v16.8h, v0.8b, v1.8b ld1 {v2.8b}, [x2], x3 ld1 {v3.8b}, [x4], x5 weight_\ext v17.8h, v2.8b, v3.8b ld1 {v4.8b}, [x2], x3 ld1 {v5.8b}, [x4], x5 weight_\ext v18.8h, v4.8b, v5.8b ld1 {v6.8b}, [x2], x3 ld1 {v7.8b}, [x4], x5 weight_\ext v19.8h, v6.8b, v7.8b sqrshrun v0.8b, v16.8h, #6 sqrshrun v1.8b, v17.8h, #6 sqrshrun v2.8b, v18.8h, #6 sqrshrun v3.8b, v19.8h, #6 st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x0], x1 st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function pixel_avg_weight_w16_\ext\()_neon load_weights_\ext dup v30.16b, w6 dup v31.16b, w7 1: // height loop subs w9, w9, #2 ld1 {v0.16b}, [x2], x3 ld1 {v1.16b}, [x4], x5 weight_\ext v16.8h, v0.8b, v1.8b weight_\ext v17.8h, v0.16b, v1.16b, 2 ld1 {v2.16b}, [x2], x3 ld1 {v3.16b}, [x4], x5 weight_\ext v18.8h, v2.8b, v3.8b weight_\ext v19.8h, v2.16b, v3.16b, 2 sqrshrun v0.8b, v16.8h, #6 sqrshrun v1.8b, v18.8h, #6 sqrshrun2 v0.16b, v17.8h, #6 sqrshrun2 v1.16b, v19.8h, #6 st1 {v0.16b}, [x0], x1 st1 {v1.16b}, [x0], x1 b.gt 1b ret endfunc .endm AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add function pixel_avg_w8_neon 1: subs w9, w9, #4 ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x4], x5 ld1 {v2.8b}, [x2], x3 urhadd v0.8b, v0.8b, v1.8b ld1 {v3.8b}, [x4], x5 st1 {v0.8b}, [x0], x1 ld1 {v4.8b}, [x2], x3 urhadd v1.8b, v2.8b, v3.8b ld1 {v5.8b}, [x4], x5 st1 {v1.8b}, [x0], x1 ld1 {v6.8b}, [x2], x3 ld1 {v7.8b}, [x4], x5 urhadd v2.8b, v4.8b, v5.8b urhadd v3.8b, v6.8b, v7.8b st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function pixel_avg_w16_neon 1: subs w9, w9, #4 ld1 {v0.16b}, [x2], x3 ld1 {v1.16b}, [x4], x5 ld1 {v2.16b}, [x2], x3 urhadd v0.16b, v0.16b, v1.16b ld1 {v3.16b}, [x4], x5 st1 {v0.16b}, [x0], x1 ld1 {v4.16b}, [x2], x3 urhadd v1.16b, v2.16b, v3.16b ld1 {v5.16b}, [x4], x5 st1 {v1.16b}, [x0], x1 ld1 {v6.16b}, [x2], x3 ld1 {v7.16b}, [x4], x5 urhadd v2.16b, v4.16b, v5.16b urhadd v3.16b, v6.16b, v7.16b st1 {v2.16b}, [x0], x1 st1 {v3.16b}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w4_neon, export=1 1: subs w5, w5, #2 ld1 {v0.s}[0], [x2], x3 ld1 {v2.s}[0], [x4], x3 urhadd v0.8b, v0.8b, v2.8b ld1 {v1.s}[0], [x2], x3 ld1 {v3.s}[0], [x4], x3 urhadd v1.8b, v1.8b, v3.8b st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w8_neon, export=1 1: subs w5, w5, #2 ld1 {v0.8b}, [x2], x3 ld1 {v2.8b}, [x4], x3 urhadd v0.8b, v0.8b, v2.8b ld1 {v1.8b}, [x2], x3 ld1 {v3.8b}, [x4], x3 urhadd v1.8b, v1.8b, v3.8b st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w16_neon, export=1 1: subs w5, w5, #2 ld1 {v0.16b}, [x2], x3 ld1 {v2.16b}, [x4], x3 urhadd v0.16b, v0.16b, v2.16b ld1 {v1.16b}, [x2], x3 ld1 {v3.16b}, [x4], x3 urhadd v1.16b, v1.16b, v3.16b st1 {v0.16b}, [x0], x1 st1 {v1.16b}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w20_neon, export=1 sub x1, x1, #16 1: subs w5, w5, #2 ld1 {v0.16b,v1.16b}, [x2], x3 ld1 {v2.16b,v3.16b}, [x4], x3 urhadd v0.16b, v0.16b, v2.16b urhadd v1.8b, v1.8b, v3.8b ld1 {v4.16b,v5.16b}, [x2], x3 ld1 {v6.16b,v7.16b}, [x4], 
x3 urhadd v4.16b, v4.16b, v6.16b urhadd v5.8b, v5.8b, v7.8b st1 {v0.16b}, [x0], #16 st1 {v1.s}[0], [x0], x1 st1 {v4.16b}, [x0], #16 st1 {v5.s}[0], [x0], x1 b.gt 1b ret endfunc .macro weight_prologue type mov w9, w5 // height .ifc \type, full ldr w12, [x4, #32] // denom .endif ldp w4, w5, [x4, #32+4] // scale, offset dup v0.16b, w4 dup v1.8h, w5 .ifc \type, full neg w12, w12 dup v2.8h, w12 .endif .endm // void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, // intptr_t dst_stride, const x264_weight_t *weight, int h ) function mc_weight_w20_neon, export=1 weight_prologue full sub x1, x1, #16 1: subs w9, w9, #2 ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 umull v22.8h, v16.8b, v0.8b umull v23.8h, v17.8b, v0.8b zip1 v18.2s, v18.2s, v21.2s umull v25.8h, v19.8b, v0.8b umull v26.8h, v20.8b, v0.8b umull v24.8h, v18.8b, v0.8b srshl v22.8h, v22.8h, v2.8h srshl v23.8h, v23.8h, v2.8h srshl v24.8h, v24.8h, v2.8h srshl v25.8h, v25.8h, v2.8h srshl v26.8h, v26.8h, v2.8h add v22.8h, v22.8h, v1.8h add v23.8h, v23.8h, v1.8h add v24.8h, v24.8h, v1.8h add v25.8h, v25.8h, v1.8h add v26.8h, v26.8h, v1.8h sqxtun v4.8b, v22.8h sqxtun2 v4.16b, v23.8h sqxtun v6.8b, v24.8h sqxtun v5.8b, v25.8h sqxtun2 v5.16b, v26.8h st1 {v4.16b}, [x0], #16 st1 {v6.s}[0], [x0], x1 st1 {v5.16b}, [x0], #16 st1 {v6.s}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w16_neon, export=1 weight_prologue full weight16_loop: 1: subs w9, w9, #2 ld1 {v4.16b}, [x2], x3 ld1 {v5.16b}, [x2], x3 umull v22.8h, v4.8b, v0.8b umull2 v23.8h, v4.16b, v0.16b umull v24.8h, v5.8b, v0.8b umull2 v25.8h, v5.16b, v0.16b srshl v22.8h, v22.8h, v2.8h srshl v23.8h, v23.8h, v2.8h srshl v24.8h, v24.8h, v2.8h srshl v25.8h, v25.8h, v2.8h add v22.8h, v22.8h, v1.8h add v23.8h, v23.8h, v1.8h add v24.8h, v24.8h, v1.8h add v25.8h, v25.8h, v1.8h sqxtun v4.8b, v22.8h sqxtun2 v4.16b, v23.8h sqxtun v5.8b, v24.8h sqxtun2 v5.16b, v25.8h st1 {v4.16b}, [x0], x1 st1 {v5.16b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w8_neon, export=1 weight_prologue full 1: subs w9, w9, #2 ld1 {v16.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3 umull v4.8h, v16.8b, v0.8b umull v5.8h, v17.8b, v0.8b srshl v4.8h, v4.8h, v2.8h srshl v5.8h, v5.8h, v2.8h add v4.8h, v4.8h, v1.8h add v5.8h, v5.8h, v1.8h sqxtun v16.8b, v4.8h sqxtun v17.8b, v5.8h st1 {v16.8b}, [x0], x1 st1 {v17.8b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w4_neon, export=1 weight_prologue full 1: subs w9, w9, #2 ld1 {v16.s}[0], [x2], x3 ld1 {v16.s}[1], [x2], x3 umull v4.8h, v16.8b, v0.8b srshl v4.8h, v4.8h, v2.8h add v4.8h, v4.8h, v1.8h sqxtun v16.8b, v4.8h st1 {v16.s}[0], [x0], x1 st1 {v16.s}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w20_nodenom_neon, export=1 weight_prologue nodenom sub x1, x1, #16 1: subs w9, w9, #2 ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 mov v27.16b, v1.16b mov v28.16b, v1.16b ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 mov v31.16b, v1.16b mov v29.16b, v1.16b mov v30.16b, v1.16b zip1 v18.2s, v18.2s, v21.2s umlal v27.8h, v16.8b, v0.8b umlal v28.8h, v17.8b, v0.8b umlal v31.8h, v18.8b, v0.8b umlal v29.8h, v19.8b, v0.8b umlal v30.8h, v20.8b, v0.8b sqxtun v4.8b, v27.8h sqxtun2 v4.16b, v28.8h sqxtun v5.8b, v29.8h sqxtun2 v5.16b, v30.8h sqxtun v6.8b, v31.8h st1 {v4.16b}, [x0], #16 st1 {v6.s}[0], [x0], x1 st1 {v5.16b}, [x0], #16 st1 {v6.s}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w16_nodenom_neon, export=1 weight_prologue nodenom 1: subs w9, w9, #2 ld1 {v6.16b}, [x2], x3 mov v27.16b, v1.16b mov v28.16b, v1.16b ld1 {v7.16b}, [x2], x3 mov v29.16b, v1.16b mov v30.16b, v1.16b 
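// Note (descriptive comment): the nodenom weight kernels seed every accumulator with
// the offset vector (v1) and then multiply-accumulate src*scale into it, i.e. each
// output pixel is clip( src*scale + offset ) with no rounding shift -- presumably the
// path taken when the weight's denominator is zero, as opposed to the plain
// mc_weight_w* kernels above, which apply srshl by -denom before adding the offset.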
umlal v27.8h, v6.8b, v0.8b umlal2 v28.8h, v6.16b, v0.16b umlal v29.8h, v7.8b, v0.8b umlal2 v30.8h, v7.16b, v0.16b sqxtun v4.8b, v27.8h sqxtun2 v4.16b, v28.8h sqxtun v5.8b, v29.8h sqxtun2 v5.16b, v30.8h st1 {v4.16b}, [x0], x1 st1 {v5.16b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w8_nodenom_neon, export=1 weight_prologue nodenom 1: subs w9, w9, #2 ld1 {v16.8b}, [x2], x3 mov v27.16b, v1.16b ld1 {v17.8b}, [x2], x3 mov v29.16b, v1.16b umlal v27.8h, v16.8b, v0.8b umlal v29.8h, v17.8b, v0.8b sqxtun v4.8b, v27.8h sqxtun v5.8b, v29.8h st1 {v4.8b}, [x0], x1 st1 {v5.8b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w4_nodenom_neon, export=1 weight_prologue nodenom 1: subs w9, w9, #2 ld1 {v16.s}[0], [x2], x3 ld1 {v16.s}[1], [x2], x3 mov v27.16b, v1.16b umlal v27.8h, v16.8b, v0.8b sqxtun v4.8b, v27.8h st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x0], x1 b.gt 1b ret endfunc .macro weight_simple_prologue ldr w6, [x4] // offset dup v1.16b, w6 .endm .macro weight_simple name op function mc_weight_w20_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 ldr s18, [x2, #16] ld1 {v16.16b}, [x2], x3 ldr s19, [x2, #16] ld1 {v17.16b}, [x2], x3 \op v18.8b, v18.8b, v1.8b \op v16.16b, v16.16b, v1.16b \op v19.8b, v19.8b, v1.8b \op v17.16b, v17.16b, v1.16b str s18, [x0, #16] st1 {v16.16b}, [x0], x1 str s19, [x0, #16] st1 {v17.16b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w16_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 ld1 {v16.16b}, [x2], x3 ld1 {v17.16b}, [x2], x3 \op v16.16b, v16.16b, v1.16b \op v17.16b, v17.16b, v1.16b st1 {v16.16b}, [x0], x1 st1 {v17.16b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w8_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 ld1 {v16.8b}, [x2], x3 ld1 {v17.8b}, [x2], x3 \op v16.8b, v16.8b, v1.8b \op v17.8b, v17.8b, v1.8b st1 {v16.8b}, [x0], x1 st1 {v17.8b}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w4_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 ld1 {v16.s}[0], [x2], x3 ld1 {v16.s}[1], [x2], x3 \op v16.8b, v16.8b, v1.8b st1 {v16.s}[0], [x0], x1 st1 {v16.s}[1], [x0], x1 b.gt 1b ret endfunc .endm weight_simple offsetadd, uqadd weight_simple offsetsub, uqsub // void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) function mc_copy_w4_neon, export=1 1: subs w4, w4, #4 ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 ld1 {v2.s}[0], [x2], x3 ld1 {v3.s}[0], [x2], x3 st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 st1 {v2.s}[0], [x0], x1 st1 {v3.s}[0], [x0], x1 b.gt 1b ret endfunc function mc_copy_w8_neon, export=1 1: subs w4, w4, #4 ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x2], x3 ld1 {v2.8b}, [x2], x3 ld1 {v3.8b}, [x2], x3 st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x0], x1 st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function mc_copy_w16_neon, export=1 1: subs w4, w4, #4 ld1 {v0.16b}, [x2], x3 ld1 {v1.16b}, [x2], x3 ld1 {v2.16b}, [x2], x3 ld1 {v3.16b}, [x2], x3 st1 {v0.16b}, [x0], x1 st1 {v1.16b}, [x0], x1 st1 {v2.16b}, [x0], x1 st1 {v3.16b}, [x0], x1 b.gt 1b ret endfunc // void mc_chroma( uint8_t *dst_u, uint8_t *dst_v, // intptr_t i_dst_stride, // uint8_t *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); function mc_chroma_neon, export=1 ldr w15, [sp] // height sbfx x12, x6, #3, #29 // asr(3) and sign extend sbfx x11, x5, #3, #29 // asr(3) and sign extend cmp w7, #4 mul x12, x12, x4 add x3, x3, x11, lsl #1 and w5, w5, #7 and w6, w6, #7 add x3, x3, x12 //pld [x3] //pld [x3, x4] b.gt mc_chroma_w8_neon b.eq mc_chroma_w4_neon 
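// Note (descriptive comment): each width-specific body below bilinearly interpolates
// one chroma plane from the interleaved UV source:
//   cA = (8-dx)*(8-dy)   cB = dx*(8-dy)   cC = (8-dx)*dy   cD = dx*dy
//   dst[x] = ( cA*s[x] + cB*s[x+1] + cC*s[x+stride] + cD*s[x+stride+1] + 32 ) >> 6
// where s is the deinterleaved U or V row and dx/dy are the low 3 bits of the mv;
// the dx==0 / dy==0 cases degenerate to a single 1-D pass (labels 3: and 5:).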
endfunc .macro CHROMA_MC_START r00, r01, r10, r11 mul w12, w5, w6 // cD = d8x *d8y lsl w13, w5, #3 add w9, w12, #64 lsl w14, w6, #3 tst w12, w12 sub w9, w9, w13 sub w10, w13, w12 // cB = d8x *(8-d8y); sub w11, w14, w12 // cC = (8-d8x)*d8y sub w9, w9, w14 // cA = (8-d8x)*(8-d8y); .endm .macro CHROMA_MC width, vsize function mc_chroma_w\width\()_neon // since the element size varies, there's a different index for the 2nd store .if \width == 4 .set idx2, 1 .else .set idx2, 2 .endif CHROMA_MC_START b.eq 2f ld2 {v28.8b,v29.8b}, [x3], x4 dup v0.8b, w9 // cA dup v1.8b, w10 // cB ext v6.8b, v28.8b, v6.8b, #1 ext v7.8b, v29.8b, v7.8b, #1 ld2 {v30.8b,v31.8b}, [x3], x4 dup v2.8b, w11 // cC dup v3.8b, w12 // cD ext v22.8b, v30.8b, v22.8b, #1 ext v23.8b, v31.8b, v23.8b, #1 trn1 v0.2s, v0.2s, v1.2s trn1 v2.2s, v2.2s, v3.2s trn1 v4.2s, v28.2s, v6.2s trn1 v5.2s, v29.2s, v7.2s trn1 v20.2s, v30.2s, v22.2s trn1 v21.2s, v31.2s, v23.2s 1: // height loop, interpolate xy subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b umlal v16.8h, v20.8b, v2.8b umull v17.8h, v5.8b, v0.8b umlal v17.8h, v21.8b, v2.8b ld2 {v28.8b,v29.8b}, [x3], x4 transpose v24.2d, v25.2d, v16.2d, v17.2d ext v6.8b, v28.8b, v6.8b, #1 ext v7.8b, v29.8b, v7.8b, #1 trn1 v4.2s, v28.2s, v6.2s trn1 v5.2s, v29.2s, v7.2s add v16.8h, v24.8h, v25.8h umull v18.8h, v20.8b, v0.8b umlal v18.8h, v4.8b, v2.8b umull v19.8h, v21.8b, v0.8b umlal v19.8h, v5.8b, v2.8b ld2 {v30.8b,v31.8b}, [x3], x4 transpose v26.2d, v27.2d, v18.2d, v19.2d ext v22.8b, v30.8b, v22.8b, #1 ext v23.8b, v31.8b, v23.8b, #1 trn1 v20.2s, v30.2s, v22.2s trn1 v21.2s, v31.2s, v23.2s add v17.8h, v26.8h, v27.8h rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 //pld [x3] //pld [x3, x4] st1 {v16.\vsize}[0], [x0], x2 st1 {v16.\vsize}[idx2], [x1], x2 st1 {v17.\vsize}[0], [x0], x2 st1 {v17.\vsize}[idx2], [x1], x2 b.gt 1b ret 2: // dx or dy are 0 tst w11, w11 add w10, w10, w11 dup v0.8b, w9 dup v1.8b, w10 b.eq 4f ld1 {v4.8b}, [x3], x4 ld1 {v6.8b}, [x3], x4 3: // vertical interpolation loop subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b ld1 {v4.8b}, [x3], x4 umlal v16.8h, v6.8b, v1.8b umull v17.8h, v6.8b, v0.8b ld1 {v6.8b}, [x3], x4 umlal v17.8h, v4.8b, v1.8b rshrn v20.8b, v16.8h, #6 // uvuvuvuv rshrn v21.8b, v17.8h, #6 // uvuvuvuv uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv //pld [x3] //pld [x3, x4] st1 {v16.\vsize}[0], [x0], x2 st1 {v16.\vsize}[idx2], [x0], x2 st1 {v17.\vsize}[0], [x1], x2 st1 {v17.\vsize}[idx2], [x1], x2 b.gt 3b ret 4: // dy is 0 ld1 {v4.8b,v5.8b}, [x3], x4 ld1 {v6.8b,v7.8b}, [x3], x4 ext v5.8b, v4.8b, v5.8b, #2 ext v7.8b, v6.8b, v7.8b, #2 5: // horizontal interpolation loop subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b umlal v16.8h, v5.8b, v1.8b umull v17.8h, v6.8b, v0.8b umlal v17.8h, v7.8b, v1.8b ld1 {v4.8b,v5.8b}, [x3], x4 ld1 {v6.8b,v7.8b}, [x3], x4 rshrn v20.8b, v16.8h, #6 rshrn v21.8b, v17.8h, #6 ext v5.8b, v4.8b, v5.8b, #2 ext v7.8b, v6.8b, v7.8b, #2 uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv //pld [x3] //pld [x3, x4] st1 {v16.\vsize}[0], [x0], x2 st1 {v16.\vsize}[idx2], [x0], x2 st1 {v17.\vsize}[0], [x1], x2 st1 {v17.\vsize}[idx2], [x1], x2 b.gt 5b ret endfunc .endm CHROMA_MC 2, h CHROMA_MC 4, s function mc_chroma_w8_neon CHROMA_MC_START b.eq 2f ld2 {v4.16b,v5.16b}, [x3], x4 ld2 {v20.16b,v21.16b}, [x3], x4 dup v0.8b, w9 // cA dup v1.8b, w10 // cB ext v6.16b, v4.16b, v4.16b, #1 ext v7.16b, v5.16b, v5.16b, #1 dup v2.8b, w11 // cC dup 
v3.8b, w12 // cD ext v22.16b, v20.16b, v20.16b, #1 ext v23.16b, v21.16b, v21.16b, #1 1: // height loop, interpolate xy subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b umlal v16.8h, v6.8b, v1.8b umlal v16.8h, v20.8b, v2.8b umlal v16.8h, v22.8b, v3.8b umull v17.8h, v5.8b, v0.8b umlal v17.8h, v7.8b, v1.8b umlal v17.8h, v21.8b, v2.8b umlal v17.8h, v23.8b, v3.8b ld2 {v4.16b,v5.16b}, [x3], x4 ext v6.16b, v4.16b, v4.16b, #1 ext v7.16b, v5.16b, v5.16b, #1 umull v18.8h, v20.8b, v0.8b umlal v18.8h, v22.8b, v1.8b umlal v18.8h, v4.8b, v2.8b umlal v18.8h, v6.8b, v3.8b umull v19.8h, v21.8b, v0.8b umlal v19.8h, v23.8b, v1.8b umlal v19.8h, v5.8b, v2.8b umlal v19.8h, v7.8b, v3.8b ld2 {v20.16b,v21.16b}, [x3], x4 rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 rshrn v18.8b, v18.8h, #6 rshrn v19.8b, v19.8h, #6 ext v22.16b, v20.16b, v20.16b, #1 ext v23.16b, v21.16b, v21.16b, #1 //pld [x3] //pld [x3, x4] st1 {v16.8b}, [x0], x2 st1 {v17.8b}, [x1], x2 st1 {v18.8b}, [x0], x2 st1 {v19.8b}, [x1], x2 b.gt 1b ret 2: // dx or dy are 0 tst w11, w11 add w10, w10, w11 dup v0.8b, w9 dup v1.8b, w10 b.eq 4f ld2 {v4.8b,v5.8b}, [x3], x4 ld2 {v6.8b,v7.8b}, [x3], x4 3: // vertical interpolation loop subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b //U umlal v16.8h, v6.8b, v1.8b umull v17.8h, v5.8b, v0.8b //V umlal v17.8h, v7.8b, v1.8b ld2 {v4.8b,v5.8b}, [x3], x4 umull v18.8h, v6.8b, v0.8b umlal v18.8h, v4.8b, v1.8b umull v19.8h, v7.8b, v0.8b umlal v19.8h, v5.8b, v1.8b ld2 {v6.8b,v7.8b}, [x3], x4 rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 rshrn v18.8b, v18.8h, #6 rshrn v19.8b, v19.8h, #6 //pld [x3] //pld [x3, x4] st1 {v16.8b}, [x0], x2 st1 {v17.8b}, [x1], x2 st1 {v18.8b}, [x0], x2 st1 {v19.8b}, [x1], x2 b.gt 3b ret 4: // dy is 0 ld2 {v4.16b,v5.16b}, [x3], x4 ext v6.16b, v4.16b, v4.16b, #1 ext v7.16b, v5.16b, v5.16b, #1 ld2 {v20.16b,v21.16b}, [x3], x4 ext v22.16b, v20.16b, v20.16b, #1 ext v23.16b, v21.16b, v21.16b, #1 5: // horizontal interpolation loop subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b //U umlal v16.8h, v6.8b, v1.8b umull v17.8h, v5.8b, v0.8b //V umlal v17.8h, v7.8b, v1.8b ld2 {v4.16b,v5.16b}, [x3], x4 umull v18.8h, v20.8b, v0.8b umlal v18.8h, v22.8b, v1.8b umull v19.8h, v21.8b, v0.8b umlal v19.8h, v23.8b, v1.8b ld2 {v20.16b,v21.16b}, [x3], x4 rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 rshrn v18.8b, v18.8h, #6 rshrn v19.8b, v19.8h, #6 ext v6.16b, v4.16b, v4.16b, #1 ext v7.16b, v5.16b, v5.16b, #1 ext v22.16b, v20.16b, v20.16b, #1 ext v23.16b, v21.16b, v21.16b, #1 //pld [x3] //pld [x3, x4] st1 {v16.8b}, [x0], x2 st1 {v17.8b}, [x1], x2 st1 {v18.8b}, [x0], x2 st1 {v19.8b}, [x1], x2 b.gt 5b ret endfunc // void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, // intptr_t stride, int width, int height, int16_t *buf ) function hpel_filter_neon, export=1 ubfm x9, x3, #0, #3 add w15, w5, w9 sub x13, x3, x9 // align src sub x10, x0, x9 sub x11, x1, x9 sub x12, x2, x9 movi v30.16b, #5 movi v31.16b, #20 1: // line start mov x3, x13 mov x2, x12 mov x1, x11 mov x0, x10 add x7, x3, #16 // src pointer next 16b for horiz filter mov x5, x15 // restore width sub x3, x3, x4, lsl #1 // src - 2*stride ld1 {v28.16b}, [x7], #16 // src[16:31] add x9, x3, x5 // holds src - 2*stride + width ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] ext v22.16b, v7.16b, v18.16b, #14 uaddl v1.8h, v16.8b, v21.8b 
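// Note (descriptive comment): vertical 6-tap filter. With the six rows r[-2..3]
// loaded above, the umlsl/umlal chain accumulates
//   v = r[-2] + r[3] - 5*(r[-1] + r[2]) + 20*(r[0] + r[1])
// into 16-bit lanes; dstv takes the rounded, saturated (v + 16) >> 5, while the
// 16-bit sums are kept and reused for the centre (dstc) pass, which evaluates
// (a - 5*b + 20*c)/16 as ((a-b)/4 - b + c)/4 + c via the ssra sequence, with
// a/b/c being the outer/middle/centre horizontal tap pairs (see the inline comments).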
ext v26.16b, v18.16b, v28.16b, #3 umlsl v1.8h, v17.8b, v30.8b ext v23.16b, v7.16b, v18.16b, #15 umlal v1.8h, v18.8b, v31.8b ext v24.16b, v18.16b, v28.16b, #1 umlal v1.8h, v19.8b, v31.8b ext v25.16b, v18.16b, v28.16b, #2 umlsl v1.8h, v20.8b, v30.8b 2: // next 16 pixel of line subs x5, x5, #16 sub x3, x9, x5 // src - 2*stride += 16 uaddl v4.8h, v22.8b, v26.8b uaddl2 v5.8h, v22.16b, v26.16b sqrshrun v6.8b, v1.8h, #5 umlsl v4.8h, v23.8b, v30.8b umlsl2 v5.8h, v23.16b, v30.16b umlal v4.8h, v18.8b, v31.8b umlal2 v5.8h, v18.16b, v31.16b umlal v4.8h, v24.8b, v31.8b umlal2 v5.8h, v24.16b, v31.16b umlsl v4.8h, v25.8b, v30.8b umlsl2 v5.8h, v25.16b, v30.16b uaddl2 v2.8h, v16.16b, v21.16b sqrshrun v4.8b, v4.8h, #5 mov v7.16b, v18.16b sqrshrun2 v4.16b, v5.8h, #5 umlsl2 v2.8h, v17.16b, v30.16b ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] umlal2 v2.8h, v18.16b, v31.16b ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] umlal2 v2.8h, v19.16b, v31.16b ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] umlsl2 v2.8h, v20.16b, v30.16b ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] st1 {v4.16b}, [x0], #16 sqrshrun2 v6.16b, v2.8h, #5 ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] ext v22.16b, v0.16b, v1.16b, #12 ext v26.16b, v1.16b, v2.16b, #6 ext v23.16b, v0.16b, v1.16b, #14 st1 {v6.16b}, [x1], #16 uaddl v3.8h, v16.8b, v21.8b ext v25.16b, v1.16b, v2.16b, #4 umlsl v3.8h, v17.8b, v30.8b ext v24.16b, v1.16b, v2.16b, #2 umlal v3.8h, v18.8b, v31.8b add v4.8h, v22.8h, v26.8h umlal v3.8h, v19.8b, v31.8b add v5.8h, v23.8h, v25.8h umlsl v3.8h, v20.8b, v30.8b add v6.8h, v24.8h, v1.8h ext v22.16b, v1.16b, v2.16b, #12 ext v26.16b, v2.16b, v3.16b, #6 ext v23.16b, v1.16b, v2.16b, #14 ext v25.16b, v2.16b, v3.16b, #4 ext v24.16b, v2.16b, v3.16b, #2 add v22.8h, v22.8h, v26.8h add v23.8h, v23.8h, v25.8h add v24.8h, v24.8h, v2.8h sub v4.8h, v4.8h, v5.8h // a-b sub v5.8h, v6.8h, v5.8h // c-b sub v22.8h, v22.8h, v23.8h // a-b sub v23.8h, v24.8h, v23.8h // c-b ssra v5.8h, v4.8h, #2 // (a-b)/4-b+c ssra v23.8h, v22.8h, #2 // (a-b)/4-b+c ssra v6.8h, v5.8h, #2 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 ssra v24.8h, v23.8h, #2 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 sqrshrun v6.8b, v6.8h, #6 ld1 {v28.16b}, [x7], #16 // src[16:31] mov v0.16b, v2.16b ext v23.16b, v7.16b, v18.16b, #15 sqrshrun2 v6.16b, v24.8h, #6 mov v1.16b, v3.16b ext v22.16b, v7.16b, v18.16b, #14 ext v24.16b, v18.16b, v28.16b, #1 ext v25.16b, v18.16b, v28.16b, #2 ext v26.16b, v18.16b, v28.16b, #3 st1 {v6.16b}, [x2], #16 b.gt 2b subs w6, w6, #1 add x10, x10, x4 add x11, x11, x4 add x12, x12, x4 add x13, x13, x4 b.gt 1b ret endfunc // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, // uint8_t *dstv, uint8_t *dstc, intptr_t src_stride, // intptr_t dst_stride, int width, int height ) function frame_init_lowres_core_neon, export=1 ldr w8, [sp] sub x10, x6, w7, uxtw // dst_stride - width and x10, x10, #~15 1: mov w9, w7 // width mov x11, x0 // src0 add x12, x0, x5 // src1 = src0 + src_stride add x13, x0, x5, lsl #1 // src2 = src1 + src_stride ld2 {v0.16b,v1.16b}, [x11], #32 ld2 {v2.16b,v3.16b}, [x12], #32 ld2 {v4.16b,v5.16b}, [x13], #32 urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x] urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x] 2: subs w9, w9, #16 urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] ld2 {v0.16b,v1.16b}, [x11], #32 ld2 {v2.16b,v3.16b}, [x12], #32 ld2 {v4.16b,v5.16b}, [x13], #32 urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] urhadd v31.16b, 
v2.16b, v4.16b // loop: s1[2x] + s2[2x] ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2] ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2] urhadd v16.16b, v20.16b, v21.16b urhadd v18.16b, v22.16b, v23.16b urhadd v17.16b, v21.16b, v24.16b urhadd v19.16b, v23.16b, v25.16b st1 {v16.16b}, [x1], #16 st1 {v18.16b}, [x3], #16 st1 {v17.16b}, [x2], #16 st1 {v19.16b}, [x4], #16 b.le 3f subs w9, w9, #16 urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] ld2 {v0.16b,v1.16b}, [x11], #32 ld2 {v2.16b,v3.16b}, [x12], #32 ld2 {v4.16b,v5.16b}, [x13], #32 urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2] ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2] urhadd v16.16b, v30.16b, v21.16b urhadd v18.16b, v31.16b, v23.16b urhadd v17.16b, v21.16b, v24.16b urhadd v19.16b, v23.16b, v25.16b st1 {v16.16b}, [x1], #16 st1 {v18.16b}, [x3], #16 st1 {v17.16b}, [x2], #16 st1 {v19.16b}, [x4], #16 b.gt 2b 3: subs w8, w8, #1 add x0, x0, x5, lsl #1 add x1, x1, x10 add x2, x2, x10 add x3, x3, x10 add x4, x4, x10 b.gt 1b ret endfunc #if HAVE_I8MM ENABLE_I8MM const hpel_filter .byte 1, -5, 20, 20, -5, 1, 0, 0 endconst const hpel_permute_tbl .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 endconst // void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, // intptr_t stride, int width, int height, int16_t *buf ) function hpel_filter_neon_i8mm, export=1 movrel x14, hpel_permute_tbl ld1 {v22.16b - v24.16b}, [x14] movrel x8, hpel_filter ld1 {v28.8b}, [x8] sxtl v0.8h, v28.8b add w15, w5, #3 mov x10, x0 sub x11, x1, #2 mov x12, x2 sub x13, x3, #2 // armv8 handles unaligned loads movi v30.16b, #5 movi v31.16b, #20 1: mov x3, x13 mov x2, x12 mov x1, x11 mov x0, x10 mov x5, x15 // restore width add x7, x3, #8 // src pointer next 16b for horiz filter sub x3, x3, x4, lsl #1 // src - 2*stride add x9, x3, x15 // holds src - 2*stride + width ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] ld1 {v29.16b}, [x7], #16 // src[16:31] uaddl v1.8h, v16.8b, v21.8b umlsl v1.8h, v17.8b, v30.8b umlal v1.8h, v18.8b, v31.8b umlal v1.8h, v19.8b, v31.8b umlsl v1.8h, v20.8b, v30.8b 2: subs x5, x5, #16 sub x3, x9, x5 // src - 2*stride += 16 movi v3.16b, #0 movi v4.16b, #0 movi v5.16b, #0 movi v6.16b, #0 tbl v25.16b, {v18.16b}, v22.16b tbl v26.16b, {v18.16b}, v23.16b tbl v27.16b, {v18.16b}, v24.16b usdot v3.4s, v25.16b, v28.4b[0] usdot v3.4s, v26.16b, v28.4b[1] usdot v4.4s, v26.16b, v28.4b[0] usdot v4.4s, v27.16b, v28.4b[1] tbl v25.16b, {v29.16b}, v22.16b tbl v26.16b, {v29.16b}, v23.16b tbl v27.16b, {v29.16b}, v24.16b uzp1 v7.8h, v3.8h, v4.8h usdot v5.4s, v25.16b, v28.4b[0] usdot v5.4s, v26.16b, v28.4b[1] usdot v6.4s, v26.16b, v28.4b[0] usdot v6.4s, v27.16b, v28.4b[1] uzp1 v6.8h, v5.8h, v6.8h sqrshrun v7.8b, v7.8h, #5 sqrshrun2 v7.16b, v6.8h, #5 st1 {v7.16b}, [x0], #16 sqrshrun v6.8b, v1.8h, #5 uaddl2 v2.8h, v16.16b, v21.16b umlsl2 v2.8h, v17.16b, v30.16b umlal2 v2.8h, v18.16b, v31.16b umlal2 v2.8h, v19.16b, v31.16b umlsl2 v2.8h, v20.16b, v30.16b sqrshrun2 v6.16b, v2.8h, #5 st1 {v6.16b}, [x1], #16 ld1 
{v16.16b}, [x3], x4 // src-2*stride[0:15] ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] ld1 {v29.16b}, [x7], #16 // src[16:31] ext v3.16b, v1.16b, v2.16b, #2 ext v4.16b, v1.16b, v2.16b, #4 ext v5.16b, v1.16b, v2.16b, #6 ext v6.16b, v1.16b, v2.16b, #8 ext v7.16b, v1.16b, v2.16b, #10 add v7.8h, v1.8h, v7.8h // filter = 1 add v6.8h, v3.8h, v6.8h // filter = -5 add v5.8h, v4.8h, v5.8h // filter = 20 sub v3.8h, v7.8h, v6.8h // a-b sub v4.8h, v5.8h, v6.8h // c-b ssra v4.8h, v3.8h, #2 // (a-b)/4-b+c ssra v5.8h, v4.8h, #2 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 mov v25.16b, v5.16b uaddl v1.8h, v16.8b, v21.8b umlsl v1.8h, v17.8b, v30.8b umlal v1.8h, v18.8b, v31.8b umlal v1.8h, v19.8b, v31.8b umlsl v1.8h, v20.8b, v30.8b ext v3.16b, v2.16b, v1.16b, #2 ext v4.16b, v2.16b, v1.16b, #4 ext v5.16b, v2.16b, v1.16b, #6 ext v6.16b, v2.16b, v1.16b, #8 ext v7.16b, v2.16b, v1.16b, #10 add v7.8h, v2.8h, v7.8h // filter = 1 add v6.8h, v3.8h, v6.8h // filter = -5 add v5.8h, v4.8h, v5.8h // filter = 20 sub v3.8h, v7.8h, v6.8h // a-b sub v4.8h, v5.8h, v6.8h // c-b ssra v4.8h, v3.8h, #2 // (a-b)/4-b+c ssra v5.8h, v4.8h, #2 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 sqrshrun v27.8b, v25.8h, #6 sqrshrun2 v27.16b, v5.8h, #6 st1 {v27.16b}, [x2], #16 b.gt 2b subs w6, w6, #1 add x10, x10, x4 add x11, x11, x4 add x12, x12, x4 add x13, x13, x4 b.gt 1b ret endfunc DISABLE_I8MM #endif // HAVE_I8MM function load_deinterleave_chroma_fenc_neon, export=1 mov x4, #FENC_STRIDE/2 b load_deinterleave_chroma endfunc function load_deinterleave_chroma_fdec_neon, export=1 mov x4, #FDEC_STRIDE/2 load_deinterleave_chroma: ld2 {v0.8b,v1.8b}, [x1], x2 ld2 {v2.8b,v3.8b}, [x1], x2 subs w3, w3, #2 st1 {v0.8b}, [x0], x4 st1 {v1.8b}, [x0], x4 st1 {v2.8b}, [x0], x4 st1 {v3.8b}, [x0], x4 b.gt load_deinterleave_chroma ret endfunc function plane_copy_core_neon, export=1 add w8, w4, #15 // 32-bit write clears the upper 32-bit the register and w4, w8, #~15 // safe use of the full reg since negative width makes no sense sub x1, x1, x4 sub x3, x3, x4 1: mov w8, w4 16: tst w8, #16 b.eq 32f subs w8, w8, #16 ldr q0, [x2], #16 str q0, [x0], #16 b.eq 0f 32: subs w8, w8, #32 ldp q0, q1, [x2], #32 stp q0, q1, [x0], #32 b.gt 32b 0: subs w5, w5, #1 add x2, x2, x3 add x0, x0, x1 b.gt 1b ret endfunc function plane_copy_swap_core_neon, export=1 lsl w4, w4, #1 sub x1, x1, x4 sub x3, x3, x4 1: mov w8, w4 tbz w4, #4, 32f subs w8, w8, #16 ld1 {v0.16b}, [x2], #16 rev16 v0.16b, v0.16b st1 {v0.16b}, [x0], #16 b.eq 0f 32: subs w8, w8, #32 ld1 {v0.16b,v1.16b}, [x2], #32 rev16 v0.16b, v0.16b rev16 v1.16b, v1.16b st1 {v0.16b,v1.16b}, [x0], #32 b.gt 32b 0: subs w5, w5, #1 add x2, x2, x3 add x0, x0, x1 b.gt 1b ret endfunc function plane_copy_deinterleave_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 sub x1, x1, x9 sub x3, x3, x9 sub x5, x5, x9, lsl #1 1: ld2 {v0.16b,v1.16b}, [x4], #32 subs w9, w9, #16 st1 {v0.16b}, [x0], #16 st1 {v1.16b}, [x2], #16 b.gt 1b add x4, x4, x5 subs w7, w7, #1 add x0, x0, x1 add x2, x2, x3 mov w9, w6 b.gt 1b ret endfunc .macro deinterleave_rgb subs x11, x11, #8 st1 {v0.8b}, [x0], #8 st1 {v1.8b}, [x2], #8 st1 {v2.8b}, [x4], #8 b.gt 1b subs w10, w10, #1 add x0, x0, x1 add x2, x2, x3 add x4, x4, x5 add x6, x6, x7 mov x11, x9 b.gt 1b .endm function plane_copy_deinterleave_rgb_neon, export=1 #if SYS_MACOSX ldr w8, [sp] ldp w9, w10, [sp, #4] #else ldr x8, [sp] ldp x9, x10, 
[sp, #8] #endif cmp w8, #3 uxtw x9, w9 add x11, x9, #7 and x11, x11, #~7 sub x1, x1, x11 sub x3, x3, x11 sub x5, x5, x11 b.ne 4f sub x7, x7, x11, lsl #1 sub x7, x7, x11 1: ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24 deinterleave_rgb ret 4: sub x7, x7, x11, lsl #2 1: ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32 deinterleave_rgb ret endfunc function plane_copy_interleave_core_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 sub x1, x1, x9, lsl #1 sub x3, x3, x9 sub x5, x5, x9 1: ld1 {v0.16b}, [x2], #16 ld1 {v1.16b}, [x4], #16 subs w9, w9, #16 st2 {v0.16b,v1.16b}, [x0], #32 b.gt 1b subs w7, w7, #1 add x0, x0, x1 add x2, x2, x3 add x4, x4, x5 mov w9, w6 b.gt 1b ret endfunc function store_interleave_chroma_neon, export=1 mov x5, #FDEC_STRIDE 1: ld1 {v0.8b}, [x2], x5 ld1 {v1.8b}, [x3], x5 ld1 {v2.8b}, [x2], x5 ld1 {v3.8b}, [x3], x5 subs w4, w4, #2 zip1 v4.16b, v0.16b, v1.16b zip1 v5.16b, v2.16b, v3.16b st1 {v4.16b}, [x0], x1 st1 {v5.16b}, [x0], x1 b.gt 1b ret endfunc .macro integral4h p1, p2 ext v1.8b, \p1\().8b, \p2\().8b, #1 ext v2.8b, \p1\().8b, \p2\().8b, #2 ext v3.8b, \p1\().8b, \p2\().8b, #3 uaddl v0.8h, \p1\().8b, v1.8b uaddl v4.8h, v2.8b, v3.8b add v0.8h, v0.8h, v4.8h add v0.8h, v0.8h, v5.8h .endm function integral_init4h_neon, export=1 sub x3, x0, x2, lsl #1 ld1 {v6.8b,v7.8b}, [x1], #16 1: subs x2, x2, #16 ld1 {v5.8h}, [x3], #16 integral4h v6, v7 ld1 {v6.8b}, [x1], #8 ld1 {v5.8h}, [x3], #16 st1 {v0.8h}, [x0], #16 integral4h v7, v6 ld1 {v7.8b}, [x1], #8 st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc .macro integral8h p1, p2, s ext v1.8b, \p1\().8b, \p2\().8b, #1 ext v2.8b, \p1\().8b, \p2\().8b, #2 ext v3.8b, \p1\().8b, \p2\().8b, #3 ext v4.8b, \p1\().8b, \p2\().8b, #4 ext v5.8b, \p1\().8b, \p2\().8b, #5 ext v6.8b, \p1\().8b, \p2\().8b, #6 ext v7.8b, \p1\().8b, \p2\().8b, #7 uaddl v0.8h, \p1\().8b, v1.8b uaddl v2.8h, v2.8b, v3.8b uaddl v4.8h, v4.8b, v5.8b uaddl v6.8h, v6.8b, v7.8b add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h add v0.8h, v0.8h, v4.8h add v0.8h, v0.8h, \s\().8h .endm function integral_init8h_neon, export=1 sub x3, x0, x2, lsl #1 ld1 {v16.8b,v17.8b}, [x1], #16 1: subs x2, x2, #16 ld1 {v18.8h}, [x3], #16 integral8h v16, v17, v18 ld1 {v16.8b}, [x1], #8 ld1 {v18.8h}, [x3], #16 st1 {v0.8h}, [x0], #16 integral8h v17, v16, v18 ld1 {v17.8b}, [x1], #8 st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc function integral_init4v_neon, export=1 mov x3, x0 add x4, x0, x2, lsl #3 add x8, x0, x2, lsl #4 sub x2, x2, #8 ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 1: subs x2, x2, #16 ld1 {v24.8h,v25.8h}, [x4], #32 ext v0.16b, v20.16b, v21.16b, #8 ext v1.16b, v21.16b, v22.16b, #8 ext v2.16b, v16.16b, v17.16b, #8 ext v3.16b, v17.16b, v18.16b, #8 sub v24.8h, v24.8h, v20.8h sub v25.8h, v25.8h, v21.8h add v0.8h, v0.8h, v20.8h add v1.8h, v1.8h, v21.8h add v2.8h, v2.8h, v16.8h add v3.8h, v3.8h, v17.8h st1 {v24.8h}, [x1], #16 st1 {v25.8h}, [x1], #16 mov v20.16b, v22.16b mov v16.16b, v18.16b sub v0.8h, v2.8h, v0.8h sub v1.8h, v3.8h, v1.8h ld1 {v21.8h,v22.8h}, [x3], #32 ld1 {v17.8h,v18.8h}, [x8], #32 st1 {v0.8h}, [x0], #16 st1 {v1.8h}, [x0], #16 b.gt 1b 2: ret endfunc function integral_init8v_neon, export=1 add x2, x0, x1, lsl #4 sub x1, x1, #8 ands x3, x1, #16 - 1 b.eq 1f subs x1, x1, #8 ld1 {v0.8h}, [x0] ld1 {v2.8h}, [x2], #16 sub v4.8h, v2.8h, v0.8h st1 {v4.8h}, [x0], #16 b.le 2f 1: subs x1, x1, #16 ld1 {v0.8h,v1.8h}, [x0] ld1 {v2.8h,v3.8h}, [x2], #32 sub v4.8h, v2.8h, v0.8h sub v5.8h, v3.8h, v1.8h st1 {v4.8h}, [x0], #16 st1 {v5.8h}, [x0], #16 b.gt 1b 2: ret endfunc #else // BIT_DEPTH 
== 8 // void pixel_avg( pixel *dst, intptr_t dst_stride, // pixel *src1, intptr_t src1_stride, // pixel *src2, intptr_t src2_stride, int weight ); .macro AVGH w h function pixel_avg_\w\()x\h\()_neon, export=1 mov w10, #64 cmp w6, #32 mov w9, #\h b.eq pixel_avg_w\w\()_neon subs w7, w10, w6 b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 cmp w6, #0 b.ge pixel_avg_weight_w\w\()_add_add_neon b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 endfunc .endm AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 AVGH 16, 8 AVGH 16, 16 // 0 < weight < 64 .macro load_weights_add_add mov w6, w6 .endm .macro weight_add_add dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s1, v30.8h umlal2 \dst, \s2, v31.8h .else umull \dst, \s1, v30.4h umlal \dst, \s2, v31.4h .endif .endm // weight > 64 .macro load_weights_add_sub neg w7, w7 .endm .macro weight_add_sub dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s1, v30.8h umlsl2 \dst, \s2, v31.8h .else umull \dst, \s1, v30.4h umlsl \dst, \s2, v31.4h .endif .endm // weight < 0 .macro load_weights_sub_add neg w6, w6 .endm .macro weight_sub_add dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s2, v31.8h umlsl2 \dst, \s1, v30.8h .else umull \dst, \s2, v31.4h umlsl \dst, \s1, v30.4h .endif .endm .macro AVG_WEIGHT ext function pixel_avg_weight_w4_\ext\()_neon load_weights_\ext dup v30.8h, w6 dup v31.8h, w7 lsl x3, x3, #1 lsl x5, x5, #1 lsl x1, x1, #1 1: // height loop subs w9, w9, #2 ld1 {v0.d}[0], [x2], x3 ld1 {v1.d}[0], [x4], x5 weight_\ext v4.4s, v0.4h, v1.4h ld1 {v2.d}[0], [x2], x3 ld1 {v3.d}[0], [x4], x5 mvni v28.8h, #0xfc, lsl #8 sqrshrun v4.4h, v4.4s, #6 weight_\ext v5.4s, v2.4h, v3.4h smin v4.4h, v4.4h, v28.4h sqrshrun v5.4h, v5.4s, #6 st1 {v4.d}[0], [x0], x1 smin v5.4h, v5.4h, v28.4h st1 {v5.d}[0], [x0], x1 b.gt 1b ret endfunc function pixel_avg_weight_w8_\ext\()_neon load_weights_\ext dup v30.8h, w6 dup v31.8h, w7 lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: // height loop subs w9, w9, #4 ld1 {v0.8h}, [x2], x3 ld1 {v1.8h}, [x4], x5 weight_\ext v16.4s, v0.4h, v1.4h weight_\ext v17.4s, v0.8h, v1.8h, 2 ld1 {v2.8h}, [x2], x3 ld1 {v3.8h}, [x4], x5 weight_\ext v18.4s, v2.4h, v3.4h weight_\ext v19.4s, v2.8h, v3.8h, 2 ld1 {v4.8h}, [x2], x3 ld1 {v5.8h}, [x4], x5 weight_\ext v20.4s, v4.4h, v5.4h weight_\ext v21.4s, v4.8h, v5.8h, 2 ld1 {v6.8h}, [x2], x3 ld1 {v7.8h}, [x4], x5 weight_\ext v22.4s, v6.4h, v7.4h weight_\ext v23.4s, v6.8h, v7.8h, 2 mvni v28.8h, #0xfc, lsl #8 sqrshrun v0.4h, v16.4s, #6 sqrshrun v2.4h, v18.4s, #6 sqrshrun v4.4h, v20.4s, #6 sqrshrun2 v0.8h, v17.4s, #6 sqrshrun v6.4h, v22.4s, #6 sqrshrun2 v2.8h, v19.4s, #6 sqrshrun2 v4.8h, v21.4s, #6 smin v0.8h, v0.8h, v28.8h smin v2.8h, v2.8h, v28.8h sqrshrun2 v6.8h, v23.4s, #6 smin v4.8h, v4.8h, v28.8h smin v6.8h, v6.8h, v28.8h st1 {v0.8h}, [x0], x1 st1 {v2.8h}, [x0], x1 st1 {v4.8h}, [x0], x1 st1 {v6.8h}, [x0], x1 b.gt 1b ret endfunc function pixel_avg_weight_w16_\ext\()_neon load_weights_\ext dup v30.8h, w6 dup v31.8h, w7 lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: // height loop subs w9, w9, #2 ld1 {v0.8h, v1.8h}, [x2], x3 ld1 {v2.8h, v3.8h}, [x4], x5 ld1 {v4.8h, v5.8h}, [x2], x3 ld1 {v6.8h, v7.8h}, [x4], x5 weight_\ext v16.4s, v0.4h, v2.4h weight_\ext v17.4s, v0.8h, v2.8h, 2 weight_\ext v18.4s, v1.4h, v3.4h weight_\ext v19.4s, v1.8h, v3.8h, 2 weight_\ext v20.4s, v4.4h, v6.4h weight_\ext v21.4s, v4.8h, v6.8h, 2 weight_\ext v22.4s, v5.4h, v7.4h weight_\ext v23.4s, v5.8h, v7.8h, 2 mvni v28.8h, #0xfc, lsl #8 sqrshrun v0.4h, v16.4s, #6 sqrshrun v1.4h, v18.4s, #6 sqrshrun v2.4h, v20.4s, #6 sqrshrun2 
v0.8h, v17.4s, #6 sqrshrun2 v1.8h, v19.4s, #6 sqrshrun2 v2.8h, v21.4s, #6 smin v0.8h, v0.8h, v28.8h smin v1.8h, v1.8h, v28.8h sqrshrun v3.4h, v22.4s, #6 smin v2.8h, v2.8h, v28.8h sqrshrun2 v3.8h, v23.4s, #6 smin v3.8h, v3.8h, v28.8h st1 {v0.8h, v1.8h}, [x0], x1 st1 {v2.8h, v3.8h}, [x0], x1 b.gt 1b ret endfunc .endm AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add function pixel_avg_w4_neon lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: subs w9, w9, #2 ld1 {v0.d}[0], [x2], x3 ld1 {v2.d}[0], [x4], x5 ld1 {v0.d}[1], [x2], x3 ld1 {v2.d}[1], [x4], x5 urhadd v0.8h, v0.8h, v2.8h st1 {v0.d}[0], [x0], x1 st1 {v0.d}[1], [x0], x1 b.gt 1b ret endfunc function pixel_avg_w8_neon lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: subs w9, w9, #4 ld1 {v0.8h}, [x2], x3 ld1 {v1.8h}, [x4], x5 ld1 {v2.8h}, [x2], x3 urhadd v0.8h, v0.8h, v1.8h ld1 {v3.8h}, [x4], x5 st1 {v0.8h}, [x0], x1 ld1 {v4.8h}, [x2], x3 urhadd v1.8h, v2.8h, v3.8h ld1 {v5.8h}, [x4], x5 st1 {v1.8h}, [x0], x1 ld1 {v6.8h}, [x2], x3 ld1 {v7.8h}, [x4], x5 urhadd v2.8h, v4.8h, v5.8h urhadd v3.8h, v6.8h, v7.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x0], x1 b.gt 1b ret endfunc function pixel_avg_w16_neon lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: subs w9, w9, #4 ld1 {v0.8h, v1.8h}, [x2], x3 ld1 {v2.8h, v3.8h}, [x4], x5 ld1 {v4.8h, v5.8h}, [x2], x3 urhadd v0.8h, v0.8h, v2.8h urhadd v1.8h, v1.8h, v3.8h ld1 {v6.8h, v7.8h}, [x4], x5 ld1 {v20.8h, v21.8h}, [x2], x3 st1 {v0.8h, v1.8h}, [x0], x1 urhadd v4.8h, v4.8h, v6.8h urhadd v5.8h, v5.8h, v7.8h ld1 {v22.8h, v23.8h}, [x4], x5 ld1 {v24.8h, v25.8h}, [x2], x3 st1 {v4.8h, v5.8h}, [x0], x1 ld1 {v26.8h, v27.8h}, [x4], x5 urhadd v20.8h, v20.8h, v22.8h urhadd v21.8h, v21.8h, v23.8h urhadd v24.8h, v24.8h, v26.8h urhadd v25.8h, v25.8h, v27.8h st1 {v20.8h, v21.8h}, [x0], x1 st1 {v24.8h, v25.8h}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w4_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 1: subs w5, w5, #2 ld1 {v0.4h}, [x2], x3 ld1 {v2.4h}, [x4], x3 ld1 {v1.4h}, [x2], x3 ld1 {v3.4h}, [x4], x3 urhadd v0.4h, v0.4h, v2.4h urhadd v1.4h, v1.4h, v3.4h st1 {v0.4h}, [x0], x1 st1 {v1.4h}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w8_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 1: subs w5, w5, #2 ld1 {v0.8h}, [x2], x3 ld1 {v2.8h}, [x4], x3 ld1 {v1.8h}, [x2], x3 ld1 {v3.8h}, [x4], x3 urhadd v0.8h, v0.8h, v2.8h urhadd v1.8h, v1.8h, v3.8h st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w16_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 1: subs w5, w5, #2 ld1 {v0.8h, v1.8h}, [x2], x3 ld1 {v2.8h, v3.8h}, [x4], x3 ld1 {v4.8h, v5.8h}, [x2], x3 ld1 {v6.8h, v7.8h}, [x4], x3 urhadd v0.8h, v0.8h, v2.8h urhadd v1.8h, v1.8h, v3.8h urhadd v4.8h, v4.8h, v6.8h urhadd v5.8h, v5.8h, v7.8h st1 {v0.8h, v1.8h}, [x0], x1 st1 {v4.8h, v5.8h}, [x0], x1 b.gt 1b ret endfunc function pixel_avg2_w20_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 sub x1, x1, #32 1: subs w5, w5, #2 ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3 ld1 {v3.8h, v4.8h, v5.8h}, [x4], x3 ld1 {v20.8h, v21.8h, v22.8h}, [x2], x3 ld1 {v23.8h, v24.8h, v25.8h}, [x4], x3 urhadd v0.8h, v0.8h, v3.8h urhadd v1.8h, v1.8h, v4.8h urhadd v2.4h, v2.4h, v5.4h urhadd v20.8h, v20.8h, v23.8h urhadd v21.8h, v21.8h, v24.8h urhadd v22.4h, v22.4h, v25.4h st1 {v0.8h, v1.8h}, [x0], #32 st1 {v2.4h}, [x0], x1 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.4h}, [x0], x1 b.gt 1b ret endfunc // void mc_copy( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int height ) function mc_copy_w4_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 1: subs w4, w4, #4 ld1 
{v0.d}[0], [x2], x3 ld1 {v1.d}[0], [x2], x3 ld1 {v2.d}[0], [x2], x3 ld1 {v3.d}[0], [x2], x3 st1 {v0.d}[0], [x0], x1 st1 {v1.d}[0], [x0], x1 st1 {v2.d}[0], [x0], x1 st1 {v3.d}[0], [x0], x1 b.gt 1b ret endfunc function mc_copy_w8_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 1: subs w4, w4, #4 ld1 {v0.8h}, [x2], x3 ld1 {v1.8h}, [x2], x3 ld1 {v2.8h}, [x2], x3 ld1 {v3.8h}, [x2], x3 st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x0], x1 st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x0], x1 b.gt 1b ret endfunc function mc_copy_w16_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 1: subs w4, w4, #4 ld1 {v0.8h, v1.8h}, [x2], x3 ld1 {v2.8h, v3.8h}, [x2], x3 ld1 {v4.8h, v5.8h}, [x2], x3 ld1 {v6.8h, v7.8h}, [x2], x3 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v2.8h, v3.8h}, [x0], x1 st1 {v4.8h, v5.8h}, [x0], x1 st1 {v6.8h, v7.8h}, [x0], x1 b.gt 1b ret endfunc .macro weight_prologue type mov w9, w5 // height .ifc \type, full ldr w12, [x4, #32] // denom .endif ldp w4, w5, [x4, #32+4] // scale, offset dup v0.8h, w4 lsl w5, w5, #2 dup v1.4s, w5 .ifc \type, full neg w12, w12 dup v2.4s, w12 .endif .endm // void mc_weight( pixel *src, intptr_t src_stride, pixel *dst, // intptr_t dst_stride, const x264_weight_t *weight, int h ) function mc_weight_w20_neon, export=1 weight_prologue full lsl x3, x3, #1 lsl x1, x1, #1 sub x1, x1, #32 1: subs w9, w9, #2 ld1 {v16.8h, v17.8h, v18.8h}, [x2], x3 ld1 {v19.8h, v20.8h, v21.8h}, [x2], x3 umull v22.4s, v16.4h, v0.4h umull2 v23.4s, v16.8h, v0.8h umull v24.4s, v17.4h, v0.4h umull2 v25.4s, v17.8h, v0.8h umull v26.4s, v18.4h, v0.4h umull v27.4s, v21.4h, v0.4h srshl v22.4s, v22.4s, v2.4s srshl v23.4s, v23.4s, v2.4s srshl v24.4s, v24.4s, v2.4s srshl v25.4s, v25.4s, v2.4s srshl v26.4s, v26.4s, v2.4s srshl v27.4s, v27.4s, v2.4s add v22.4s, v22.4s, v1.4s add v23.4s, v23.4s, v1.4s add v24.4s, v24.4s, v1.4s add v25.4s, v25.4s, v1.4s add v26.4s, v26.4s, v1.4s add v27.4s, v27.4s, v1.4s sqxtun v22.4h, v22.4s sqxtun2 v22.8h, v23.4s sqxtun v23.4h, v24.4s sqxtun2 v23.8h, v25.4s sqxtun v24.4h, v26.4s sqxtun2 v24.8h, v27.4s umull v16.4s, v19.4h, v0.4h umull2 v17.4s, v19.8h, v0.8h umull v18.4s, v20.4h, v0.4h umull2 v19.4s, v20.8h, v0.8h srshl v16.4s, v16.4s, v2.4s srshl v17.4s, v17.4s, v2.4s srshl v18.4s, v18.4s, v2.4s srshl v19.4s, v19.4s, v2.4s add v16.4s, v16.4s, v1.4s add v17.4s, v17.4s, v1.4s add v18.4s, v18.4s, v1.4s add v19.4s, v19.4s, v1.4s sqxtun v16.4h, v16.4s sqxtun2 v16.8h, v17.4s sqxtun v17.4h, v18.4s sqxtun2 v17.8h, v19.4s mvni v31.8h, #0xfc, lsl #8 umin v22.8h, v22.8h, v31.8h umin v23.8h, v23.8h, v31.8h umin v24.8h, v24.8h, v31.8h umin v16.8h, v16.8h, v31.8h umin v17.8h, v17.8h, v31.8h st1 {v22.8h, v23.8h}, [x0], #32 st1 {v24.d}[0], [x0], x1 st1 {v16.8h, v17.8h}, [x0], #32 st1 {v24.d}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w16_neon, export=1 weight_prologue full lsl x1, x1, #1 lsl x3, x3, #1 1: subs w9, w9, #2 ld1 {v4.8h, v5.8h}, [x2], x3 ld1 {v6.8h, v7.8h}, [x2], x3 umull v22.4s, v4.4h, v0.4h umull2 v23.4s, v4.8h, v0.8h umull v24.4s, v5.4h, v0.4h umull2 v25.4s, v5.8h, v0.8h srshl v22.4s, v22.4s, v2.4s srshl v23.4s, v23.4s, v2.4s srshl v24.4s, v24.4s, v2.4s srshl v25.4s, v25.4s, v2.4s add v22.4s, v22.4s, v1.4s add v23.4s, v23.4s, v1.4s add v24.4s, v24.4s, v1.4s add v25.4s, v25.4s, v1.4s sqxtun v22.4h, v22.4s sqxtun2 v22.8h, v23.4s sqxtun v23.4h, v24.4s sqxtun2 v23.8h, v25.4s umull v26.4s, v6.4h, v0.4h umull2 v27.4s, v6.8h, v0.8h umull v28.4s, v7.4h, v0.4h umull2 v29.4s, v7.8h, v0.8h srshl v26.4s, v26.4s, v2.4s srshl v27.4s, v27.4s, v2.4s srshl v28.4s, v28.4s, v2.4s srshl v29.4s, v29.4s, v2.4s add 
v26.4s, v26.4s, v1.4s add v27.4s, v27.4s, v1.4s add v28.4s, v28.4s, v1.4s add v29.4s, v29.4s, v1.4s sqxtun v26.4h, v26.4s sqxtun2 v26.8h, v27.4s sqxtun v27.4h, v28.4s sqxtun2 v27.8h, v29.4s mvni v31.8h, 0xfc, lsl #8 umin v22.8h, v22.8h, v31.8h umin v23.8h, v23.8h, v31.8h umin v26.8h, v26.8h, v31.8h umin v27.8h, v27.8h, v31.8h st1 {v22.8h, v23.8h}, [x0], x1 st1 {v26.8h, v27.8h}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w8_neon, export=1 weight_prologue full lsl x3, x3, #1 lsl x1, x1, #1 1: subs w9, w9, #2 ld1 {v16.8h}, [x2], x3 ld1 {v17.8h}, [x2], x3 umull v4.4s, v16.4h, v0.4h umull2 v5.4s, v16.8h, v0.8h umull v6.4s, v17.4h, v0.4h umull2 v7.4s, v17.8h, v0.8h srshl v4.4s, v4.4s, v2.4s srshl v5.4s, v5.4s, v2.4s srshl v6.4s, v6.4s, v2.4s srshl v7.4s, v7.4s, v2.4s add v4.4s, v4.4s, v1.4s add v5.4s, v5.4s, v1.4s add v6.4s, v6.4s, v1.4s add v7.4s, v7.4s, v1.4s sqxtun v16.4h, v4.4s sqxtun2 v16.8h, v5.4s sqxtun v17.4h, v6.4s sqxtun2 v17.8h, v7.4s mvni v28.8h, #0xfc, lsl #8 umin v16.8h, v16.8h, v28.8h umin v17.8h, v17.8h, v28.8h st1 {v16.8h}, [x0], x1 st1 {v17.8h}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w4_neon, export=1 weight_prologue full lsl x3, x3, #1 lsl x1, x1, #1 1: subs w9, w9, #2 ld1 {v16.d}[0], [x2], x3 ld1 {v16.d}[1], [x2], x3 umull v4.4s, v16.4h, v0.4h umull2 v5.4s, v16.8h, v0.8h srshl v4.4s, v4.4s, v2.4s srshl v5.4s, v5.4s, v2.4s add v4.4s, v4.4s, v1.4s add v5.4s, v5.4s, v1.4s sqxtun v16.4h, v4.4s sqxtun2 v16.8h, v5.4s mvni v28.8h, #0xfc, lsl #8 umin v16.8h, v16.8h, v28.8h st1 {v16.d}[0], [x0], x1 st1 {v16.d}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w20_nodenom_neon, export=1 weight_prologue nodenom lsl x3, x3, #1 lsl x1, x1, #1 sub x1, x1, #32 1: subs w9, w9, #2 ld1 {v16.8h, v17.8h, v18.8h}, [x2], x3 mov v20.16b, v1.16b mov v21.16b, v1.16b mov v22.16b, v1.16b mov v23.16b, v1.16b mov v24.16b, v1.16b mov v25.16b, v1.16b ld1 {v2.8h, v3.8h, v4.8h}, [x2], x3 mov v26.16b, v1.16b mov v27.16b, v1.16b mov v28.16b, v1.16b mov v29.16b, v1.16b umlal v20.4s, v16.4h, v0.4h umlal2 v21.4s, v16.8h, v0.8h umlal v22.4s, v17.4h, v0.4h umlal2 v23.4s, v17.8h, v0.8h umlal v24.4s, v18.4h, v0.4h umlal v25.4s, v4.4h, v0.4h umlal v26.4s, v2.4h, v0.4h umlal2 v27.4s, v2.8h, v0.8h umlal v28.4s, v3.4h, v0.4h umlal2 v29.4s, v3.8h, v0.8h sqxtun v2.4h, v20.4s sqxtun2 v2.8h, v21.4s sqxtun v3.4h, v22.4s sqxtun2 v3.8h, v23.4s sqxtun v4.4h, v24.4s sqxtun2 v4.8h, v25.4s sqxtun v5.4h, v26.4s sqxtun2 v5.8h, v27.4s sqxtun v6.4h, v28.4s sqxtun2 v6.8h, v29.4s mvni v31.8h, 0xfc, lsl #8 umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h umin v4.8h, v4.8h, v31.8h umin v5.8h, v5.8h, v31.8h umin v6.8h, v6.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], #32 st1 {v4.d}[0], [x0], x1 st1 {v5.8h, v6.8h}, [x0], #32 st1 {v4.d}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w16_nodenom_neon, export=1 weight_prologue nodenom lsl x1, x1, #1 lsl x3, x3, #1 1: subs w9, w9, #2 ld1 {v2.8h, v3.8h}, [x2], x3 mov v27.16b, v1.16b mov v28.16b, v1.16b mov v29.16b, v1.16b mov v30.16b, v1.16b ld1 {v4.8h, v5.8h}, [x2], x3 mov v20.16b, v1.16b mov v21.16b, v1.16b mov v22.16b, v1.16b mov v23.16b, v1.16b umlal v27.4s, v2.4h, v0.4h umlal2 v28.4s, v2.8h, v0.8h umlal v29.4s, v3.4h, v0.4h umlal2 v30.4s, v3.8h, v0.8h umlal v20.4s, v4.4h, v0.4h umlal2 v21.4s, v4.8h, v0.8h umlal v22.4s, v5.4h, v0.4h umlal2 v23.4s, v5.8h, v0.8h sqxtun v2.4h, v27.4s sqxtun2 v2.8h, v28.4s sqxtun v3.4h, v29.4s sqxtun2 v3.8h, v30.4s sqxtun v4.4h, v20.4s sqxtun2 v4.8h, v21.4s sqxtun v5.4h, v22.4s sqxtun2 v5.8h, v23.4s mvni v31.8h, 0xfc, lsl #8 umin v2.8h, v2.8h, 
v31.8h umin v3.8h, v3.8h, v31.8h umin v4.8h, v4.8h, v31.8h umin v5.8h, v5.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], x1 st1 {v4.8h, v5.8h}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w8_nodenom_neon, export=1 weight_prologue nodenom lsl x1, x1, #1 lsl x3, x3, #1 1: subs w9, w9, #2 ld1 {v16.8h}, [x2], x3 mov v27.16b, v1.16b ld1 {v17.8h}, [x2], x3 mov v28.16b, v1.16b mov v29.16b, v1.16b mov v30.16b, v1.16b umlal v27.4s, v16.4h, v0.4h umlal2 v28.4s, v16.8h, v0.8h umlal v29.4s, v17.4h, v0.4h umlal2 v30.4s, v17.8h, v0.8h sqxtun v4.4h, v27.4s sqxtun2 v4.8h, v28.4s sqxtun v5.4h, v29.4s sqxtun2 v5.8h, v30.4s mvni v31.8h, 0xfc, lsl #8 umin v4.8h, v4.8h, v31.8h umin v5.8h, v5.8h, v31.8h st1 {v4.8h}, [x0], x1 st1 {v5.8h}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w4_nodenom_neon, export=1 weight_prologue nodenom lsl x1, x1, #1 lsl x3, x3, #1 1: subs w9, w9, #2 ld1 {v16.d}[0], [x2], x3 ld1 {v16.d}[1], [x2], x3 mov v27.16b, v1.16b mov v28.16b, v1.16b umlal v27.4s, v16.4h, v0.4h umlal2 v28.4s, v16.8h, v0.8h sqxtun v4.4h, v27.4s sqxtun2 v4.8h, v28.4s mvni v31.8h, 0xfc, lsl #8 umin v4.8h, v4.8h, v31.8h st1 {v4.d}[0], [x0], x1 st1 {v4.d}[1], [x0], x1 b.gt 1b ret endfunc .macro weight_simple_prologue ldr w6, [x4] // offset lsl w6, w6, #2 dup v1.8h, w6 .endm .macro weight_simple name op function mc_weight_w20_\name\()_neon, export=1 weight_simple_prologue lsl x1, x1, #1 lsl x3, x3, #1 sub x1, x1, #32 1: subs w5, w5, #2 ld1 {v2.8h, v3.8h, v4.8h}, [x2], x3 ld1 {v5.8h, v6.8h, v7.8h}, [x2], x3 zip1 v4.2d, v4.2d, v7.2d \op v2.8h, v2.8h, v1.8h \op v3.8h, v3.8h, v1.8h \op v4.8h, v4.8h, v1.8h \op v5.8h, v5.8h, v1.8h \op v6.8h, v6.8h, v1.8h mvni v31.8h, #0xfc, lsl #8 umin v2.8h, v2.8h, v28.8h umin v3.8h, v3.8h, v28.8h umin v4.8h, v4.8h, v28.8h umin v5.8h, v5.8h, v28.8h umin v6.8h, v6.8h, v28.8h st1 {v2.8h, v3.8h}, [x0], #32 st1 {v4.d}[0], [x0], x1 st1 {v5.8h, v6.8h}, [x0], #32 st1 {v4.d}[1], [x0], x1 b.gt 1b ret endfunc function mc_weight_w16_\name\()_neon, export=1 weight_simple_prologue lsl x1, x1, #1 lsl x3, x3, #1 1: subs w5, w5, #2 ld1 {v16.8h, v17.8h}, [x2], x3 ld1 {v18.8h, v19.8h}, [x2], x3 \op v16.8h, v16.8h, v1.8h \op v17.8h, v17.8h, v1.8h \op v18.8h, v18.8h, v1.8h \op v19.8h, v19.8h, v1.8h mvni v28.8h, #0xfc, lsl #8 umin v16.8h, v16.8h, v28.8h umin v17.8h, v17.8h, v28.8h umin v18.8h, v18.8h, v28.8h umin v19.8h, v19.8h, v28.8h st1 {v16.8h, v17.8h}, [x0], x1 st1 {v18.8h, v19.8h}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w8_\name\()_neon, export=1 weight_simple_prologue lsl x1, x1, #1 lsl x3, x3, #1 1: subs w5, w5, #2 ld1 {v16.8h}, [x2], x3 ld1 {v17.8h}, [x2], x3 \op v16.8h, v16.8h, v1.8h \op v17.8h, v17.8h, v1.8h mvni v28.8h, 0xfc, lsl #8 umin v16.8h, v16.8h, v28.8h umin v17.8h, v17.8h, v28.8h st1 {v16.8h}, [x0], x1 st1 {v17.8h}, [x0], x1 b.gt 1b ret endfunc function mc_weight_w4_\name\()_neon, export=1 weight_simple_prologue lsl x1, x1, #1 lsl x3, x3, #1 1: subs w5, w5, #2 ld1 {v16.d}[0], [x2], x3 ld1 {v16.d}[1], [x2], x3 \op v16.8h, v16.8h, v1.8h mvni v28.8h, 0xfc, lsl #8 umin v16.8h, v16.8h, v28.8h st1 {v16.d}[0], [x0], x1 st1 {v16.d}[1], [x0], x1 b.gt 1b ret endfunc .endm weight_simple offsetadd, uqadd weight_simple offsetsub, uqsub // void mc_chroma( pixel *dst_u, pixel *dst_v, // intptr_t i_dst_stride, // pixel *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); function mc_chroma_neon, export=1 ldr w15, [sp] // height sbfx x12, x6, #3, #29 // asr(3) and sign extend sbfx x11, x5, #3, #29 // asr(3) and sign extend cmp w7, #4 lsl x4, x4, #1 mul x12, x12, x4 add x3, x3, 
x11, lsl #2 and w5, w5, #7 and w6, w6, #7 add x3, x3, x12 b.gt mc_chroma_w8_neon b.eq mc_chroma_w4_neon endfunc .macro CHROMA_MC_START r00, r01, r10, r11 mul w12, w5, w6 // cD = d8x *d8y lsl w13, w5, #3 add w9, w12, #64 lsl w14, w6, #3 tst w12, w12 sub w9, w9, w13 sub w10, w13, w12 // cB = d8x *(8-d8y); sub w11, w14, w12 // cC = (8-d8x)*d8y sub w9, w9, w14 // cA = (8-d8x)*(8-d8y); .endm .macro CHROMA_MC width, vsize function mc_chroma_w\width\()_neon lsl x2, x2, #1 // since the element size varies, there's a different index for the 2nd store .if \width == 4 .set idx2, 1 .else .set idx2, 2 .endif CHROMA_MC_START b.eq 2f ld2 {v28.8h, v29.8h}, [x3], x4 dup v0.8h, w9 // cA dup v1.8h, w10 // cB ext v6.16b, v28.16b, v28.16b, #2 ext v7.16b, v29.16b, v29.16b, #2 ld2 {v30.8h, v31.8h}, [x3], x4 dup v2.8h, w11 // cC dup v3.8h, w12 // cD ext v22.16b, v30.16b, v30.16b, #2 ext v23.16b, v31.16b, v31.16b, #2 trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d trn1 v4.2d, v28.2d, v6.2d trn1 v5.2d, v29.2d, v7.2d trn1 v20.2d, v30.2d, v22.2d trn1 v21.2d, v31.2d, v23.2d 1: // height loop, interpolate xy subs w15, w15, #2 mul v16.8h, v4.8h, v0.8h mul v17.8h, v5.8h, v0.8h mla v16.8h, v20.8h, v2.8h mla v17.8h, v21.8h, v2.8h ld2 {v28.8h, v29.8h}, [x3], x4 transpose v24.2d, v25.2d, v16.2d, v17.2d ext v6.16b, v28.16b, v28.16b, #2 ext v7.16b, v29.16b, v29.16b, #2 trn1 v4.2d, v28.2d, v6.2d trn1 v5.2d, v29.2d, v7.2d add v16.8h, v24.8h, v25.8h urshr v16.8h, v16.8h, #6 mul v18.8h, v20.8h, v0.8h mul v19.8h, v21.8h, v0.8h mla v18.8h, v4.8h, v2.8h mla v19.8h, v5.8h, v2.8h ld2 {v30.8h, v31.8h}, [x3], x4 transpose v26.2d, v27.2d, v18.2d, v19.2d add v18.8h, v26.8h, v27.8h urshr v18.8h, v18.8h, #6 ext v22.16b, v30.16b, v30.16b, #2 ext v23.16b, v31.16b, v31.16b, #2 trn1 v20.2d, v30.2d, v22.2d trn1 v21.2d, v31.2d, v23.2d st1 {v16.\vsize}[0], [x0], x2 st1 {v16.\vsize}[idx2], [x1], x2 st1 {v18.\vsize}[0], [x0], x2 st1 {v18.\vsize}[idx2], [x1], x2 b.gt 1b ret 2: // dx or dy are 0 tst w11, w11 add w10, w10, w11 dup v0.8h, w9 dup v1.8h, w10 b.eq 4f ld1 {v4.8h}, [x3], x4 ld1 {v6.8h}, [x3], x4 3: // vertical interpolation loop subs w15, w15, #2 mul v16.8h, v4.8h, v0.8h mla v16.8h, v6.8h, v1.8h ld1 {v4.8h}, [x3], x4 mul v17.8h, v6.8h, v0.8h mla v17.8h, v4.8h, v1.8h ld1 {v6.8h}, [x3], x4 urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 uzp1 v18.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv uzp2 v19.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv st1 {v18.\vsize}[0], [x0], x2 st1 {v18.\vsize}[idx2], [x0], x2 st1 {v19.\vsize}[0], [x1], x2 st1 {v19.\vsize}[idx2], [x1], x2 b.gt 3b ret 4: // dy is 0 ld1 {v4.8h, v5.8h}, [x3], x4 ld1 {v6.8h, v7.8h}, [x3], x4 ext v5.16b, v4.16b, v5.16b, #4 ext v7.16b, v6.16b, v7.16b, #4 5: // horizontal interpolation loop subs w15, w15, #2 mul v16.8h, v4.8h, v0.8h mla v16.8h, v5.8h, v1.8h mul v17.8h, v6.8h, v0.8h mla v17.8h, v7.8h, v1.8h ld1 {v4.8h, v5.8h}, [x3], x4 ld1 {v6.8h, v7.8h}, [x3], x4 urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 ext v5.16b, v4.16b, v5.16b, #4 ext v7.16b, v6.16b, v7.16b, #4 uzp1 v18.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv uzp2 v19.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv st1 {v18.\vsize}[0], [x0], x2 st1 {v18.\vsize}[idx2], [x0], x2 st1 {v19.\vsize}[0], [x1], x2 st1 {v19.\vsize}[idx2], [x1], x2 b.gt 5b ret endfunc .endm CHROMA_MC 2, s CHROMA_MC 4, d function mc_chroma_w8_neon lsl x2, x2, #1 CHROMA_MC_START b.eq 2f sub x4, x4, #32 ld2 {v4.8h, v5.8h}, [x3], #32 ld2 {v6.8h, v7.8h}, [x3], x4 ld2 {v20.8h, v21.8h}, [x3], #32 ld2 {v22.8h, v23.8h}, [x3], x4 dup 
v0.8h, w9 // cA dup v1.8h, w10 // cB ext v24.16b, v4.16b, v6.16b, #2 ext v26.16b, v6.16b, v4.16b, #2 ext v28.16b, v20.16b, v22.16b, #2 ext v30.16b, v22.16b, v20.16b, #2 ext v25.16b, v5.16b, v7.16b, #2 ext v27.16b, v7.16b, v5.16b, #2 ext v29.16b, v21.16b, v23.16b, #2 ext v31.16b, v23.16b, v21.16b, #2 dup v2.8h, w11 // cC dup v3.8h, w12 // cD 1: // height loop, interpolate xy subs w15, w15, #2 mul v16.8h, v4.8h, v0.8h mul v17.8h, v5.8h, v0.8h mla v16.8h, v24.8h, v1.8h mla v17.8h, v25.8h, v1.8h mla v16.8h, v20.8h, v2.8h mla v17.8h, v21.8h, v2.8h mla v16.8h, v28.8h, v3.8h mla v17.8h, v29.8h, v3.8h urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 st1 {v16.8h}, [x0], x2 st1 {v17.8h}, [x1], x2 ld2 {v4.8h, v5.8h}, [x3], #32 ld2 {v6.8h, v7.8h}, [x3], x4 mul v16.8h, v20.8h, v0.8h mul v17.8h, v21.8h, v0.8h ext v24.16b, v4.16b, v6.16b, #2 ext v26.16b, v6.16b, v4.16b, #2 mla v16.8h, v28.8h, v1.8h mla v17.8h, v29.8h, v1.8h ext v25.16b, v5.16b, v7.16b, #2 ext v27.16b, v7.16b, v5.16b, #2 mla v16.8h, v4.8h, v2.8h mla v17.8h, v5.8h, v2.8h mla v16.8h, v24.8h, v3.8h mla v17.8h, v25.8h, v3.8h urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 ld2 {v20.8h, v21.8h}, [x3], #32 ld2 {v22.8h, v23.8h}, [x3], x4 ext v28.16b, v20.16b, v22.16b, #2 ext v30.16b, v22.16b, v20.16b, #2 ext v29.16b, v21.16b, v23.16b, #2 ext v31.16b, v23.16b, v21.16b, #2 st1 {v16.8h}, [x0], x2 st1 {v17.8h}, [x1], x2 b.gt 1b ret 2: // dx or dy are 0 tst w11, w11 add w10, w10, w11 dup v0.8h, w9 dup v1.8h, w10 b.eq 4f ld2 {v4.8h, v5.8h}, [x3], x4 ld2 {v6.8h, v7.8h}, [x3], x4 3: // vertical interpolation loop subs w15, w15, #2 mul v16.8h, v4.8h, v0.8h mul v17.8h, v5.8h, v0.8h mla v16.8h, v6.8h, v1.8h mla v17.8h, v7.8h, v1.8h urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 st1 {v16.8h}, [x0], x2 st1 {v17.8h}, [x1], x2 ld2 {v4.8h, v5.8h}, [x3], x4 mul v16.8h, v6.8h, v0.8h mul v17.8h, v7.8h, v0.8h ld2 {v6.8h, v7.8h}, [x3], x4 mla v16.8h, v4.8h, v1.8h mla v17.8h, v5.8h, v1.8h urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 st1 {v16.8h}, [x0], x2 st1 {v17.8h}, [x1], x2 b.gt 3b ret 4: // dy is 0 sub x4, x4, #32 ld2 {v4.8h, v5.8h}, [x3], #32 ld2 {v6.8h, v7.8h}, [x3], x4 ext v24.16b, v4.16b, v6.16b, #2 ext v26.16b, v6.16b, v4.16b, #2 ld2 {v20.8h, v21.8h}, [x3], #32 ld2 {v22.8h, v23.8h}, [x3], x4 ext v28.16b, v20.16b, v22.16b, #2 ext v30.16b, v22.16b, v20.16b, #2 ext v25.16b, v5.16b, v7.16b, #2 ext v27.16b, v7.16b, v5.16b, #2 ext v29.16b, v21.16b, v23.16b, #2 ext v31.16b, v23.16b, v21.16b, #2 5: // horizontal interpolation loop subs w15, w15, #2 mul v16.8h, v4.8h, v0.8h mul v17.8h, v5.8h, v0.8h mla v16.8h, v24.8h, v1.8h mla v17.8h, v25.8h, v1.8h urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 st1 {v16.8h}, [x0], x2 st1 {v17.8h}, [x1], x2 mul v16.8h, v20.8h, v0.8h mul v17.8h, v21.8h, v0.8h ld2 {v4.8h, v5.8h}, [x3], #32 ld2 {v6.8h, v7.8h}, [x3], x4 mla v16.8h, v28.8h, v1.8h mla v17.8h, v29.8h, v1.8h ld2 {v20.8h,v21.8h}, [x3], #32 ld2 {v22.8h,v23.8h}, [x3], x4 urshr v16.8h, v16.8h, #6 urshr v17.8h, v17.8h, #6 ext v24.16b, v4.16b, v6.16b, #2 ext v26.16b, v6.16b, v4.16b, #2 ext v28.16b, v20.16b, v22.16b, #2 ext v30.16b, v22.16b, v20.16b, #2 ext v29.16b, v21.16b, v23.16b, #2 ext v31.16b, v23.16b, v21.16b, #2 ext v25.16b, v5.16b, v7.16b, #2 ext v27.16b, v7.16b, v5.16b, #2 st1 {v16.8h}, [x0], x2 st1 {v17.8h}, [x1], x2 b.gt 5b ret endfunc .macro integral4h p1, p2 ext v1.16b, \p1\().16b, \p2\().16b, #2 ext v2.16b, \p1\().16b, \p2\().16b, #4 ext v3.16b, \p1\().16b, \p2\().16b, #6 add v0.8h, \p1\().8h, v1.8h add v4.8h, v2.8h, v3.8h add v0.8h, v0.8h, v4.8h add 
v0.8h, v0.8h, v5.8h .endm function integral_init4h_neon, export=1 sub x3, x0, x2, lsl #1 lsl x2, x2, #1 ld1 {v6.8h,v7.8h}, [x1], #32 1: subs x2, x2, #32 ld1 {v5.8h}, [x3], #16 integral4h v6, v7 ld1 {v6.8h}, [x1], #16 ld1 {v5.8h}, [x3], #16 st1 {v0.8h}, [x0], #16 integral4h v7, v6 ld1 {v7.8h}, [x1], #16 st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc .macro integral8h p1, p2, s ext v1.16b, \p1\().16b, \p2\().16b, #2 ext v2.16b, \p1\().16b, \p2\().16b, #4 ext v3.16b, \p1\().16b, \p2\().16b, #6 ext v4.16b, \p1\().16b, \p2\().16b, #8 ext v5.16b, \p1\().16b, \p2\().16b, #10 ext v6.16b, \p1\().16b, \p2\().16b, #12 ext v7.16b, \p1\().16b, \p2\().16b, #14 add v0.8h, \p1\().8h, v1.8h add v2.8h, v2.8h, v3.8h add v4.8h, v4.8h, v5.8h add v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h add v0.8h, v0.8h, v4.8h add v0.8h, v0.8h, \s\().8h .endm function integral_init8h_neon, export=1 sub x3, x0, x2, lsl #1 lsl x2, x2, #1 ld1 {v16.8h, v17.8h}, [x1], #32 1: subs x2, x2, #32 ld1 {v18.8h}, [x3], #16 integral8h v16, v17, v18 ld1 {v16.8h}, [x1], #16 ld1 {v18.8h}, [x3], #16 st1 {v0.8h}, [x0], #16 integral8h v17, v16, v18 ld1 {v17.8h}, [x1], #16 st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc function integral_init4v_neon, export=1 mov x3, x0 add x4, x0, x2, lsl #3 add x8, x0, x2, lsl #4 lsl x2, x2, #1 sub x2, x2, #16 ld1 {v20.8h, v21.8h, v22.8h}, [x3], #48 ld1 {v16.8h, v17.8h, v18.8h}, [x8], #48 1: subs x2, x2, #32 ld1 {v24.8h, v25.8h}, [x4], #32 ext v0.16b, v20.16b, v21.16b, #8 ext v1.16b, v21.16b, v22.16b, #8 ext v2.16b, v16.16b, v17.16b, #8 ext v3.16b, v17.16b, v18.16b, #8 sub v24.8h, v24.8h, v20.8h sub v25.8h, v25.8h, v21.8h add v0.8h, v0.8h, v20.8h add v1.8h, v1.8h, v21.8h add v2.8h, v2.8h, v16.8h add v3.8h, v3.8h, v17.8h st1 {v24.8h}, [x1], #16 st1 {v25.8h}, [x1], #16 mov v20.16b, v22.16b mov v16.16b, v18.16b sub v0.8h, v2.8h, v0.8h sub v1.8h, v3.8h, v1.8h ld1 {v21.8h, v22.8h}, [x3], #32 ld1 {v17.8h, v18.8h}, [x8], #32 st1 {v0.8h}, [x0], #16 st1 {v1.8h}, [x0], #16 b.gt 1b 2: ret endfunc function integral_init8v_neon, export=1 add x2, x0, x1, lsl #4 sub x1, x1, #8 ands x3, x1, #16 - 1 b.eq 1f subs x1, x1, #8 ld1 {v0.8h}, [x0] ld1 {v2.8h}, [x2], #16 sub v4.8h, v2.8h, v0.8h st1 {v4.8h}, [x0], #16 b.le 2f 1: subs x1, x1, #16 ld1 {v0.8h,v1.8h}, [x0] ld1 {v2.8h,v3.8h}, [x2], #32 sub v4.8h, v2.8h, v0.8h sub v5.8h, v3.8h, v1.8h st1 {v4.8h}, [x0], #16 st1 {v5.8h}, [x0], #16 b.gt 1b 2: ret endfunc // frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, // pixel *dstv, pixel *dstc, intptr_t src_stride, // intptr_t dst_stride, int width, int height ) function frame_init_lowres_core_neon, export=1 ldr w8, [sp] lsl x5, x5, #1 sub x10, x6, w7, uxtw // dst_stride - width lsl x10, x10, #1 and x10, x10, #~31 stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] 1: mov w9, w7 // width mov x11, x0 // src0 add x12, x0, x5 // src1 = src0 + src_stride add x13, x0, x5, lsl #1 // src2 = src1 + src_stride ld2 {v0.8h, v1.8h}, [x11], #32 ld2 {v2.8h, v3.8h}, [x11], #32 ld2 {v4.8h, v5.8h}, [x12], #32 ld2 {v6.8h, v7.8h}, [x12], #32 ld2 {v28.8h, v29.8h}, [x13], #32 ld2 {v30.8h, v31.8h}, [x13], #32 urhadd v20.8h, v0.8h, v4.8h urhadd v21.8h, v2.8h, v6.8h urhadd v22.8h, v4.8h, v28.8h urhadd v23.8h, v6.8h, v30.8h 2: subs w9, w9, #16 urhadd v24.8h, v1.8h, v5.8h urhadd v25.8h, v3.8h, v7.8h urhadd v26.8h, v5.8h, v29.8h urhadd v27.8h, v7.8h, v31.8h ld2 {v0.8h, v1.8h}, [x11], #32 ld2 {v2.8h, v3.8h}, [x11], #32 ld2 {v4.8h, v5.8h}, [x12], #32 ld2 {v6.8h, v7.8h}, [x12], #32 ld2 {v28.8h, v29.8h}, [x13], #32 ld2 {v30.8h, v31.8h}, [x13], #32 urhadd v16.8h, v0.8h, v4.8h urhadd v17.8h, v2.8h, v6.8h urhadd v18.8h, v4.8h, v28.8h urhadd v19.8h, v6.8h, v30.8h ext v8.16b, v20.16b, v21.16b, #2 ext v9.16b, v21.16b, v16.16b, #2 ext v10.16b, v22.16b, v23.16b, #2 ext v11.16b, v23.16b, v18.16b, #2 urhadd v12.8h, v20.8h, v24.8h urhadd v8.8h, v24.8h, v8.8h urhadd v24.8h, v21.8h, v25.8h urhadd v22.8h, v22.8h, v26.8h urhadd v10.8h, v26.8h, v10.8h urhadd v26.8h, v23.8h, v27.8h urhadd v9.8h, v25.8h, v9.8h urhadd v11.8h, v27.8h, v11.8h st1 {v12.8h}, [x1], #16 st1 {v24.8h}, [x1], #16 st1 {v22.8h}, [x3], #16 st1 {v26.8h}, [x3], #16 st1 {v8.8h, v9.8h}, [x2], #32 st1 {v10.8h, v11.8h}, [x4], #32 b.le 3f subs w9, w9, #16 urhadd v24.8h, v1.8h, v5.8h urhadd v25.8h, v3.8h, v7.8h urhadd v26.8h, v5.8h, v29.8h urhadd v27.8h, v7.8h, v31.8h ld2 {v0.8h, v1.8h}, [x11], #32 ld2 {v2.8h, v3.8h}, [x11], #32 ld2 {v4.8h, v5.8h}, [x12], #32 ld2 {v6.8h, v7.8h}, [x12], #32 ld2 {v28.8h, v29.8h}, [x13], #32 ld2 {v30.8h, v31.8h}, [x13], #32 urhadd v20.8h, v0.8h, v4.8h urhadd v21.8h, v2.8h, v6.8h urhadd v22.8h, v4.8h, v28.8h urhadd v23.8h, v6.8h, v30.8h ext v8.16b, v16.16b, v17.16b, #2 ext v9.16b, v17.16b, v20.16b, #2 ext v10.16b, v18.16b, v19.16b, #2 ext v11.16b, v19.16b, v22.16b, #2 urhadd v12.8h, v16.8h, v24.8h urhadd v13.8h, v17.8h, v25.8h urhadd v14.8h, v18.8h, v26.8h urhadd v15.8h, v19.8h, v27.8h urhadd v16.8h, v24.8h, v8.8h urhadd v17.8h, v25.8h, v9.8h urhadd v18.8h, v26.8h, v10.8h urhadd v19.8h, v27.8h, v11.8h st1 {v12.8h, v13.8h}, [x1], #32 st1 {v14.8h, v15.8h}, [x3], #32 st1 {v16.8h, v17.8h}, [x2], #32 st1 {v18.8h, v19.8h}, [x4], #32 b.gt 2b 3: subs w8, w8, #1 add x0, x0, x5, lsl #1 add x1, x1, x10 add x2, x2, x10 add x3, x3, x10 add x4, x4, x10 b.gt 1b ldp d8, d9, [sp] ldp d10, d11, [sp, #0x10] ldp d12, d13, [sp, #0x20] ldp d14, d15, [sp, #0x30] add sp, sp, #0x40 ret endfunc function load_deinterleave_chroma_fenc_neon, export=1 mov x4, #FENC_STRIDE/2 lsl x4, x4, #1 lsl x2, x2, #1 b load_deinterleave_chroma endfunc function load_deinterleave_chroma_fdec_neon, export=1 mov x4, #FDEC_STRIDE/2 lsl x4, x4, #1 lsl x2, x2, #1 load_deinterleave_chroma: ld2 {v0.8h, v1.8h}, [x1], x2 ld2 {v2.8h, v3.8h}, [x1], x2 subs w3, w3, #2 st1 {v0.8h}, [x0], x4 st1 {v1.8h}, [x0], x4 st1 {v2.8h}, [x0], x4 st1 {v3.8h}, [x0], x4 b.gt load_deinterleave_chroma ret endfunc function store_interleave_chroma_neon, export=1 mov x5, #FDEC_STRIDE lsl x5, x5, #1 lsl x1, x1, #1 1: ld1 {v0.8h}, [x2], x5 ld1 {v1.8h}, [x3], x5 ld1 {v2.8h}, [x2], x5 ld1 {v3.8h}, [x3], x5 subs w4, w4, #2 zip1 v4.8h, v0.8h, v1.8h zip1 v6.8h, v2.8h, v3.8h zip2 v5.8h, v0.8h, v1.8h zip2 v7.8h, v2.8h, v3.8h st1 {v4.8h, v5.8h}, [x0], x1 st1 {v6.8h, v7.8h}, [x0], x1 b.gt 1b ret endfunc function 
plane_copy_core_neon, export=1 add w8, w4, #31 // 32-bit write clears the upper 32-bit the register and w4, w8, #~31 // safe use of the full reg since negative width makes no sense sub x1, x1, x4 sub x3, x3, x4 lsl x1, x1, #1 lsl x3, x3, #1 1: mov w8, w4 16: tst w8, #16 b.eq 32f subs w8, w8, #16 ldp q0, q1, [x2], #32 stp q0, q1, [x0], #32 b.eq 0f 32: subs w8, w8, #32 ldp q0, q1, [x2], #32 ldp q2, q3, [x2], #32 stp q0, q1, [x0], #32 stp q2, q3, [x0], #32 b.gt 32b 0: subs w5, w5, #1 add x2, x2, x3 add x0, x0, x1 b.gt 1b ret endfunc function plane_copy_swap_core_neon, export=1 lsl w4, w4, #1 add w8, w4, #31 // 32-bit write clears the upper 32-bit the register and w4, w8, #~31 sub x1, x1, x4 sub x3, x3, x4 lsl x1, x1, #1 lsl x3, x3, #1 1: mov w8, w4 tbz w4, #4, 32f subs w8, w8, #16 ld1 {v0.8h, v1.8h}, [x2], #32 rev32 v0.8h, v0.8h rev32 v1.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.eq 0f 32: subs w8, w8, #32 ld1 {v0.8h ,v1.8h, v2.8h, v3.8h}, [x2], #64 rev32 v20.8h, v0.8h rev32 v21.8h, v1.8h rev32 v22.8h, v2.8h rev32 v23.8h, v3.8h st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 b.gt 32b 0: subs w5, w5, #1 add x2, x2, x3 add x0, x0, x1 b.gt 1b ret endfunc function plane_copy_deinterleave_neon, export=1 add w9, w6, #15 and w9, w9, #~15 sub x1, x1, x9 sub x3, x3, x9 sub x5, x5, x9, lsl #1 lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: ld2 {v0.8h, v1.8h}, [x4], #32 ld2 {v2.8h, v3.8h}, [x4], #32 subs w9, w9, #16 st1 {v0.8h}, [x0], #16 st1 {v2.8h}, [x0], #16 st1 {v1.8h}, [x2], #16 st1 {v3.8h}, [x2], #16 b.gt 1b add x4, x4, x5 subs w7, w7, #1 add x0, x0, x1 add x2, x2, x3 mov w9, w6 b.gt 1b ret endfunc function plane_copy_interleave_core_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 sub x1, x1, x9, lsl #1 sub x3, x3, x9 sub x5, x5, x9 lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 1: ld1 {v0.8h}, [x2], #16 ld1 {v1.8h}, [x4], #16 ld1 {v2.8h}, [x2], #16 ld1 {v3.8h}, [x4], #16 subs w9, w9, #16 st2 {v0.8h, v1.8h}, [x0], #32 st2 {v2.8h, v3.8h}, [x0], #32 b.gt 1b subs w7, w7, #1 add x0, x0, x1 add x2, x2, x3 add x4, x4, x5 mov w9, w6 b.gt 1b ret endfunc .macro deinterleave_rgb subs x11, x11, #8 st1 {v0.8h}, [x0], #16 st1 {v1.8h}, [x2], #16 st1 {v2.8h}, [x4], #16 b.gt 1b subs w10, w10, #1 add x0, x0, x1 add x2, x2, x3 add x4, x4, x5 add x6, x6, x7 mov x11, x9 b.gt 1b .endm function plane_copy_deinterleave_rgb_neon, export=1 #if SYS_MACOSX ldr w8, [sp] ldp w9, w10, [sp, #4] #else ldr x8, [sp] ldp x9, x10, [sp, #8] #endif cmp w8, #3 uxtw x9, w9 add x11, x9, #7 and x11, x11, #~7 sub x1, x1, x11 sub x3, x3, x11 sub x5, x5, x11 lsl x1, x1, #1 lsl x3, x3, #1 lsl x5, x5, #1 b.ne 4f sub x7, x7, x11, lsl #1 sub x7, x7, x11 lsl x7, x7, #1 1: ld3 {v0.8h, v1.8h, v2.8h}, [x6], #48 deinterleave_rgb ret 4: sub x7, x7, x11, lsl #2 lsl x7, x7, #1 1: ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 deinterleave_rgb ret endfunc // void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, // intptr_t stride, int width, int height, int16_t *buf ) function hpel_filter_neon, export=1 lsl x5, x5, #1 ubfm x9, x3, #3, #7 add w15, w5, w9 sub x13, x3, x9 // align src sub x10, x0, x9 sub x11, x1, x9 sub x12, x2, x9 movi v30.8h, #5 movi v31.8h, #20 lsl x4, x4, #1 stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] str q0, [sp, #-0x50]! 
1: // line start mov x3, x13 mov x2, x12 mov x1, x11 mov x0, x10 add x7, x3, #32 // src pointer next 16b for horiz filter mov x5, x15 // restore width sub x3, x3, x4, lsl #1 // src - 2*stride ld1 {v28.8h, v29.8h}, [x7], #32 // src[16:31] add x9, x3, x5 // holds src - 2*stride + width ld1 {v8.8h, v9.8h}, [x3], x4 // src-2*stride[0:15] ld1 {v10.8h, v11.8h}, [x3], x4 // src-1*stride[0:15] ld1 {v12.8h, v13.8h}, [x3], x4 // src-0*stride[0:15] ld1 {v14.8h, v15.8h}, [x3], x4 // src+1*stride[0:15] ld1 {v16.8h, v17.8h}, [x3], x4 // src+2*stride[0:15] ld1 {v18.8h, v19.8h}, [x3], x4 // src+3*stride[0:15] ext v22.16b, v7.16b, v12.16b, #12 ext v23.16b, v12.16b, v13.16b, #12 uaddl v1.4s, v8.4h, v18.4h uaddl2 v20.4s, v8.8h, v18.8h ext v24.16b, v12.16b, v13.16b, #6 ext v25.16b, v13.16b, v28.16b, #6 umlsl v1.4s, v10.4h, v30.4h umlsl2 v20.4s, v10.8h, v30.8h ext v26.16b, v7.16b, v12.16b, #14 ext v27.16b, v12.16b, v13.16b, #14 umlal v1.4s, v12.4h, v31.4h umlal2 v20.4s, v12.8h, v31.8h ext v3.16b, v12.16b, v13.16b, #2 ext v4.16b, v13.16b, v28.16b, #2 umlal v1.4s, v14.4h, v31.4h umlal2 v20.4s, v14.8h, v31.8h ext v21.16b, v12.16b, v13.16b, #4 ext v5.16b, v13.16b, v28.16b, #4 umlsl v1.4s, v16.4h, v30.4h umlsl2 v20.4s, v16.8h, v30.8h 2: // next 16 pixel of line subs x5, x5, #32 sub x3, x9, x5 // src - 2*stride += 16 uaddl v8.4s, v22.4h, v24.4h uaddl2 v22.4s, v22.8h, v24.8h uaddl v10.4s, v23.4h, v25.4h uaddl2 v23.4s, v23.8h, v25.8h umlsl v8.4s, v26.4h, v30.4h umlsl2 v22.4s, v26.8h, v30.8h umlsl v10.4s, v27.4h, v30.4h umlsl2 v23.4s, v27.8h, v30.8h umlal v8.4s, v12.4h, v31.4h umlal2 v22.4s, v12.8h, v31.8h umlal v10.4s, v13.4h, v31.4h umlal2 v23.4s, v13.8h, v31.8h umlal v8.4s, v3.4h, v31.4h umlal2 v22.4s, v3.8h, v31.8h umlal v10.4s, v4.4h, v31.4h umlal2 v23.4s, v4.8h, v31.8h umlsl v8.4s, v21.4h, v30.4h umlsl2 v22.4s, v21.8h, v30.8h umlsl v10.4s, v5.4h, v30.4h umlsl2 v23.4s, v5.8h, v30.8h uaddl v5.4s, v9.4h, v19.4h uaddl2 v2.4s, v9.8h, v19.8h sqrshrun v8.4h, v8.4s, #5 sqrshrun2 v8.8h, v22.4s, #5 sqrshrun v10.4h, v10.4s, #5 sqrshrun2 v10.8h, v23.4s, #5 mov v6.16b, v12.16b mov v7.16b, v13.16b mvni v23.8h, #0xfc, lsl #8 umin v8.8h, v8.8h, v23.8h umin v10.8h, v10.8h, v23.8h st1 {v8.8h}, [x0], #16 st1 {v10.8h}, [x0], #16 umlsl v5.4s, v11.4h, v30.4h umlsl2 v2.4s, v11.8h, v30.8h ld1 {v8.8h, v9.8h}, [x3], x4 umlal v5.4s, v13.4h, v31.4h umlal2 v2.4s, v13.8h, v31.8h ld1 {v10.8h, v11.8h}, [x3], x4 umlal v5.4s, v15.4h, v31.4h umlal2 v2.4s, v15.8h, v31.8h ld1 {v12.8h, v13.8h}, [x3], x4 umlsl v5.4s, v17.4h, v30.4h umlsl2 v2.4s, v17.8h, v30.8h ld1 {v14.8h, v15.8h}, [x3], x4 sqrshrun v4.4h, v5.4s, #5 sqrshrun2 v4.8h, v2.4s, #5 sqrshrun v18.4h, v1.4s, #5 sqrshrun2 v18.8h, v20.4s, #5 mvni v17.8h, #0xfc, lsl #8 smin v4.8h, v4.8h, v17.8h smin v18.8h, v18.8h, v17.8h st1 {v18.8h}, [x1], #16 st1 {v4.8h}, [x1], #16 ld1 {v16.8h, v17.8h}, [x3], x4 // src+2*stride[0:15] ld1 {v18.8h, v19.8h}, [x3], x4 // src+3*stride[0:15] str q9, [sp, #0x10] str q15, [sp, #0x20] str q17, [sp, #0x30] str q19, [sp, #0x40] ldr q28, [sp] ext v22.16b, v28.16b, v1.16b, #8 ext v9.16b, v1.16b, v20.16b, #8 ext v26.16b, v1.16b, v20.16b, #12 ext v17.16b, v20.16b, v5.16b, #12 ext v23.16b, v28.16b, v1.16b, #12 ext v19.16b, v1.16b, v20.16b, #12 uaddl v3.4s, v8.4h, v18.4h uaddl2 v15.4s, v8.8h, v18.8h umlsl v3.4s, v10.4h, v30.4h umlsl2 v15.4s, v10.8h, v30.8h umlal v3.4s, v12.4h, v31.4h umlal2 v15.4s, v12.8h, v31.8h umlal v3.4s, v14.4h, v31.4h umlal2 v15.4s, v14.8h, v31.8h umlsl v3.4s, v16.4h, v30.4h umlsl2 v15.4s, v16.8h, v30.8h add v4.4s, v22.4s, v26.4s add v26.4s, v9.4s, 
v17.4s ext v25.16b, v1.16b, v20.16b, #8 ext v22.16b, v20.16b, v5.16b, #8 ext v24.16b, v1.16b, v20.16b, #4 ext v9.16b, v20.16b, v5.16b, #4 add v31.4s, v23.4s, v25.4s add v19.4s, v19.4s, v22.4s add v6.4s, v24.4s, v1.4s add v17.4s, v9.4s, v20.4s sub v4.4s, v4.4s, v31.4s // a-b sub v26.4s, v26.4s, v19.4s // a-b sub v31.4s, v31.4s, v6.4s // b-c sub v19.4s, v19.4s, v17.4s // b-c ext v22.16b, v20.16b, v5.16b, #8 ext v9.16b, v5.16b, v2.16b, #8 ext v24.16b, v5.16b, v2.16b, #12 ext v28.16b, v2.16b, v3.16b, #12 ext v23.16b, v20.16b, v5.16b, #12 ext v30.16b, v5.16b, v2.16b, #12 ext v25.16b, v5.16b, v2.16b, #8 ext v29.16b, v2.16b, v3.16b, #8 add v22.4s, v22.4s, v24.4s add v9.4s, v9.4s, v28.4s add v23.4s, v23.4s, v25.4s add v29.4s, v29.4s, v30.4s ext v24.16b, v5.16b, v2.16b, #4 ext v28.16b, v2.16b, v3.16b, #4 add v24.4s, v24.4s, v5.4s add v28.4s, v28.4s, v2.4s sub v22.4s, v22.4s, v23.4s sub v9.4s, v9.4s, v29.4s sub v23.4s, v23.4s, v24.4s sub v29.4s, v29.4s, v28.4s sshr v4.4s, v4.4s, #2 sshr v0.4s, v26.4s, #2 sshr v22.4s, v22.4s, #2 sshr v9.4s, v9.4s, #2 sub v4.4s, v4.4s, v31.4s sub v0.4s, v0.4s, v19.4s sub v22.4s, v22.4s, v23.4s sub v9.4s, v9.4s, v29.4s sshr v4.4s, v4.4s, #2 sshr v0.4s, v0.4s, #2 sshr v22.4s, v22.4s, #2 sshr v9.4s, v9.4s, #2 add v4.4s, v4.4s, v6.4s add v0.4s, v0.4s, v17.4s add v22.4s, v22.4s, v24.4s add v9.4s, v9.4s, v28.4s str q2, [sp] sqrshrun v4.4h, v4.4s, #6 sqrshrun2 v4.8h, v0.4s, #6 sqrshrun v22.4h, v22.4s, #6 sqrshrun2 v22.8h, v9.4s, #6 mov v0.16b, v5.16b ld1 {v28.8h, v29.8h}, [x7], #32 // src[16:31] ldr q9, [sp, #0x10] ldr q17, [sp, #0x30] ldr q19, [sp, #0x40] ext v26.16b, v7.16b, v12.16b, #14 ext v27.16b, v12.16b, v13.16b, #14 mvni v25.8h, 0xfc, lsl #8 smin v22.8h, v22.8h, v25.8h smin v4.8h, v4.8h, v25.8h st1 {v4.8h}, [x2], #16 st1 {v22.8h}, [x2], #16 mov v1.16b, v3.16b mov v20.16b, v15.16b ldr q15, [sp, #0x20] ext v22.16b, v7.16b, v12.16b, #12 ext v23.16b, v12.16b, v13.16b, #12 ext v3.16b, v12.16b, v13.16b, #2 ext v4.16b, v13.16b, v28.16b, #2 ext v21.16b, v12.16b, v13.16b, #4 ext v5.16b, v13.16b, v28.16b, #4 ext v24.16b, v12.16b, v13.16b, #6 ext v25.16b, v13.16b, v28.16b, #6 movi v30.8h, #5 movi v31.8h, #20 b.gt 2b subs w6, w6, #1 add x10, x10, x4 add x11, x11, x4 add x12, x12, x4 add x13, x13, x4 b.gt 1b add sp, sp, #0x50 ldp d8, d9, [sp] ldp d10, d11, [sp, #0x10] ldp d12, d13, [sp, #0x20] ldp d14, d15, [sp, #0x30] add sp, sp, #0x40 ret endfunc #endif x264-master/common/aarch64/mc-c.c000066400000000000000000000431321502133446700165650ustar00rootroot00000000000000/***************************************************************************** * mc-c.c: aarch64 motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
* * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "mc.h" #define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64) void x264_prefetch_ref_aarch64( pixel *, intptr_t, int ); #define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64) void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64) void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon) void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); #define x264_memzero_aligned_neon x264_template(memzero_aligned_neon) void x264_memzero_aligned_neon( void *dst, size_t n ); #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon) void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon) void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon) void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon) void x264_pixel_avg_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon) void x264_pixel_avg_8x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon) void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon) void x264_pixel_avg_4x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon) void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon) void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve) void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve) void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve) void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve) void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon) void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon) void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon) void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define 
x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon) void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_plane_copy_core_neon x264_template(plane_copy_core_neon) void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon) void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon) void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon) void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); #define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon) void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon) void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); #define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon) void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon) void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon) #define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon) #define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon) #define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon) #define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon) #define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon) #define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon) #define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon) #define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon) #define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon) #define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon) #define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon) #define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon) #define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon) #define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon) #define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon) #define MC_WEIGHT(func)\ void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t 
*, int );\ \ static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w8##func##_neon,\ x264_mc_weight_w16##func##_neon,\ x264_mc_weight_w16##func##_neon,\ x264_mc_weight_w20##func##_neon,\ }; MC_WEIGHT() MC_WEIGHT(_nodenom) MC_WEIGHT(_offsetadd) MC_WEIGHT(_offsetsub) #define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon) void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon) void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon) void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_chroma_neon x264_template(mc_chroma_neon) void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int ); #define x264_integral_init4h_neon x264_template(integral_init4h_neon) void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t ); #define x264_integral_init4v_neon x264_template(integral_init4v_neon) void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t ); #define x264_integral_init8h_neon x264_template(integral_init8h_neon) void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t ); #define x264_integral_init8v_neon x264_template(integral_init8v_neon) void x264_integral_init8v_neon( uint16_t *, intptr_t ); #define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon) void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int ); #define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon) void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); #define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon) void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count ); #define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon) void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count ); static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) = { NULL, x264_pixel_avg2_w4_neon, x264_pixel_avg2_w8_neon, x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function x264_pixel_avg2_w16_neon, x264_pixel_avg2_w20_neon, }; static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) = { NULL, x264_mc_copy_w4_neon, x264_mc_copy_w8_neon, NULL, x264_mc_copy_w16_neon, }; static void weight_cache_neon( x264_t *h, x264_weight_t *w ) { if( w->i_scale == 1<i_denom ) { if( w->i_offset < 0 ) { w->weightfn = mc_offsetsub_wtab_neon; w->cachea[0] = -w->i_offset; } else { w->weightfn = mc_offsetadd_wtab_neon; w->cachea[0] = w->i_offset; } } else if( !w->i_denom ) w->weightfn = mc_nodenom_wtab_neon; else w->weightfn = mc_wtab_neon; } static void mc_luma_neon( pixel *dst, intptr_t i_dst_stride, pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if( (mvy&3) == 3 ) // explicit if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); 
pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height ); } else if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height ); else mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); } static pixel *get_ref_neon( pixel *dst, intptr_t *i_dst_stride, pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if( (mvy&3) == 3 ) // explicit if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height ); return dst; } else if( weight->weightfn ) { weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height ); return dst; } else { *i_dst_stride = i_src_stride; return src1; } } #define x264_hpel_filter_neon x264_template(hpel_filter_neon) void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, intptr_t stride, int width, int height, int16_t *buf ); #if !HIGH_BIT_DEPTH && HAVE_I8MM #define x264_hpel_filter_neon_i8mm x264_template(hpel_filter_neon_i8mm) void x264_hpel_filter_neon_i8mm( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, intptr_t stride, int width, int height, int16_t *buf ); #endif // !HIGH_BIT_DEPTH && HAVE_I8MM PLANE_COPY(16, neon) PLANE_COPY_SWAP(16, neon) PLANE_INTERLEAVE(neon) PROPAGATE_LIST(neon) void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf ) { if( cpu&X264_CPU_ARMV8 ) { pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64; pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64; pf->prefetch_ref = x264_prefetch_ref_aarch64; } if( cpu&X264_CPU_NEON ) { pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; pf->mbtree_propagate_list = mbtree_propagate_list_neon; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon; pf->memcpy_aligned = x264_memcpy_aligned_neon; pf->memzero_aligned = x264_memzero_aligned_neon; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; pf->weight = mc_wtab_neon; pf->offsetadd = mc_offsetadd_wtab_neon; pf->offsetsub = mc_offsetsub_wtab_neon; pf->weight_cache = weight_cache_neon; pf->mc_chroma = x264_mc_chroma_neon; pf->mc_luma = mc_luma_neon; pf->get_ref = get_ref_neon; pf->integral_init4h = x264_integral_init4h_neon; pf->integral_init8h = x264_integral_init8h_neon; pf->integral_init4v = x264_integral_init4v_neon; pf->integral_init8v = 
x264_integral_init8v_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon; pf->store_interleave_chroma = x264_store_interleave_chroma_neon; pf->plane_copy = plane_copy_neon; pf->plane_copy_swap = plane_copy_swap_neon; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = plane_copy_interleave_neon; pf->hpel_filter = x264_hpel_filter_neon; } #if !HIGH_BIT_DEPTH #if HAVE_SVE if( cpu&X264_CPU_SVE ) { pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_sve; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sve; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sve; } #endif #if HAVE_I8MM if( cpu&X264_CPU_I8MM ) { pf->hpel_filter = x264_hpel_filter_neon_i8mm; } #endif // HAVE_I8MM #endif // !HIGH_BIT_DEPTH } x264-master/common/aarch64/mc.h000066400000000000000000000026021502133446700163470ustar00rootroot00000000000000/***************************************************************************** * mc.h: aarch64 motion compensation ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_AARCH64_MC_H #define X264_AARCH64_MC_H #define x264_mc_init_aarch64 x264_template(mc_init_aarch64) void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf ); #endif x264-master/common/aarch64/pixel-a-common.S000066400000000000000000000034451502133446700205560ustar00rootroot00000000000000/**************************************************************************** * pixel-a-common.S: aarch64 pixel metrics ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ // This file contains the NEON macros and constants that are intended to be used by // the SVE/SVE2 functions as well const mask_ac_4_8 .short 0, -1, -1, -1, 0, -1, -1, -1 .short 0, -1, -1, -1, -1, -1, -1, -1 endconst .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d SUMSUB_AB \s1, \d1, \a, \b SUMSUB_AB \s2, \d2, \c, \d .endm .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 .endm x264-master/common/aarch64/pixel-a-sve.S000066400000000000000000000357471502133446700200750ustar00rootroot00000000000000/***************************************************************************** * pixel-a-sve.S: aarch64 pixel metrics ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" #include "pixel-a-common.S" ENABLE_SVE #if BIT_DEPTH == 8 .macro SSD_START_SVE_4 ptrue p0.h, vl4 ld1b {z16.h}, p0/z, [x0] ld1b {z17.h}, p0/z, [x2] add x0, x0, x1 add x2, x2, x3 sub v2.4h, v16.4h, v17.4h ld1b {z16.h}, p0/z, [x0] ld1b {z17.h}, p0/z, [x2] add x0, x0, x1 add x2, x2, x3 smull v0.4s, v2.4h, v2.4h .endm .macro SSD_SVE_4 sub v2.4h, v16.4h, v17.4h ld1b {z16.h}, p0/z, [x0] ld1b {z17.h}, p0/z, [x2] add x0, x0, x1 add x2, x2, x3 smlal v0.4s, v2.4h, v2.4h .endm .macro SSD_END_SVE_4 sub v2.4h, v16.4h, v17.4h smlal v0.4s, v2.4h, v2.4h .endm .macro SSD_START_SVE_8 ptrue p0.h, vl8 ld1b {z16.h}, p0/z, [x0] ld1b {z17.h}, p0/z, [x2] add x0, x0, x1 add x2, x2, x3 sub v2.8h, v16.8h, v17.8h ld1b {z16.h}, p0/z, [x0] smull v0.4s, v2.4h, v2.4h ld1b {z17.h}, p0/z, [x2] smlal2 v0.4s, v2.8h, v2.8h add x0, x0, x1 add x2, x2, x3 .endm .macro SSD_SVE_8 sub v2.8h, v16.8h, v17.8h ld1b {z16.h}, p0/z, [x0] smlal v0.4s, v2.4h, v2.4h ld1b {z17.h}, p0/z, [x2] smlal2 v0.4s, v2.8h, v2.8h add x0, x0, x1 add x2, x2, x3 .endm .macro SSD_END_SVE_8 sub v2.8h, v16.8h, v17.8h smlal v0.4s, v2.4h, v2.4h smlal2 v0.4s, v2.8h, v2.8h .endm .macro SSD_FUNC_SVE w h function pixel_ssd_\w\()x\h\()_sve, export=1 SSD_START_SVE_\w .rept \h-2 SSD_SVE_\w .endr SSD_END_SVE_\w addv s0, v0.4s mov w0, v0.s[0] ret endfunc .endm .macro load_diff_fly_sve_8x8 ld1b {z1.h}, p0/z, [x2] ld1b {z0.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 ld1b {z3.h}, p0/z, [x2] ld1b {z2.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 sub v16.8h, v0.8h, v1.8h sub v17.8h, v2.8h, v3.8h ld1b {z5.h}, p0/z, [x2] ld1b {z4.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 ld1b {z7.h}, p0/z, [x2] ld1b {z6.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 sub v18.8h, v4.8h, v5.8h sub v19.8h, v6.8h, v7.8h ld1b {z1.h}, p0/z, [x2] ld1b {z0.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 ld1b {z3.h}, p0/z, [x2] ld1b {z2.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 sub v20.8h, v0.8h, v1.8h sub v21.8h, v2.8h, v3.8h ld1b {z5.h}, p0/z, [x2] ld1b {z4.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 ld1b {z7.h}, p0/z, [x2] ld1b {z6.h}, p0/z, [x0] add x2, x2, x3 add x0, x0, x1 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h sub v22.8h, v4.8h, v5.8h sub v23.8h, v6.8h, v7.8h .endm .macro pixel_var_sve_8 h function pixel_var_8x\h\()_sve, export=1 ptrue p0.h, vl8 ld1b {z16.h}, p0/z, [x0] add x0, x0, x1 ld1b {z17.h}, p0/z, [x0] add x0, x0, x1 mov x2, \h - 4 mul v1.8h, v16.8h, v16.8h mul v2.8h, v17.8h, v17.8h add v0.8h, v16.8h, v17.8h ld1b {z18.h}, p0/z, [x0] add x0, x0, x1 uaddlp v1.4s, v1.8h uaddlp v2.4s, v2.8h ld1b {z19.h}, p0/z, [x0] add x0, x0, x1 1: subs x2, x2, #4 add v0.8h, v0.8h, v18.8h mul v24.8h, v18.8h, v18.8h ld1b {z20.h}, p0/z, [x0] add x0, x0, x1 add v0.8h, v0.8h, v19.8h mul v25.8h, v19.8h, v19.8h uadalp v1.4s, v24.8h ld1b {z21.h}, p0/z, [x0] add x0, x0, x1 add v0.8h, v0.8h, v20.8h mul v26.8h, v20.8h, v20.8h uadalp v2.4s, v25.8h ld1b {z18.h}, p0/z, [x0] add x0, x0, x1 add v0.8h, v0.8h, v21.8h mul v27.8h, v21.8h, v21.8h uadalp v1.4s, v26.8h ld1b {z19.h}, p0/z, [x0] add x0, x0, x1 uadalp v2.4s, v27.8h b.gt 1b add v0.8h, v0.8h, v18.8h mul v28.8h, v18.8h, v18.8h add v0.8h, v0.8h, v19.8h mul v29.8h, v19.8h, v19.8h uadalp v1.4s, v28.8h uadalp v2.4s, v29.8h b var_end endfunc .endm function var_end add v1.4s, v1.4s, v2.4s uaddlv s0, v0.8h uaddlv d1, v1.4s mov w0, v0.s[0] mov x1, v1.d[0] orr x0, x0, x1, lsl #32 ret endfunc .macro SUMSUBL_AB_SVE sum, sub, a, b add \sum, \a, \b 
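// note: the predicated ld1b loads used by the SVE paths already widen bytes into .h lanes, so plain add/sub suffice here, unlike the uaddl/usubl pair in the NEON SUMSUBL_AB helper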
sub \sub, \a, \b .endm function pixel_sa8d_8x8_sve, export=1 ptrue p0.h, vl8 mov x4, x30 bl pixel_sa8d_8x8_sve add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] add w0, w0, #1 lsr w0, w0, #1 ret x4 endfunc .macro sa8d_satd_sve_8x8 satd= function pixel_sa8d_\satd\()8x8_sve load_diff_fly_sve_8x8 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h .ifc \satd, satd_ transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h transpose v4.4s, v6.4s, v24.4s, v26.4s transpose v5.4s, v7.4s, v25.4s, v27.4s transpose v24.4s, v26.4s, v0.4s, v2.4s transpose v25.4s, v27.4s, v1.4s, v3.4s abs v0.8h, v4.8h abs v1.8h, v5.8h abs v2.8h, v6.8h abs v3.8h, v7.8h abs v4.8h, v24.8h abs v5.8h, v25.8h abs v6.8h, v26.8h abs v7.8h, v27.8h umax v0.8h, v0.8h, v2.8h umax v1.8h, v1.8h, v3.8h umax v2.8h, v4.8h, v6.8h umax v3.8h, v5.8h, v7.8h add v26.8h, v0.8h, v1.8h add v27.8h, v2.8h, v3.8h .endif SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h transpose v20.8h, v21.8h, v16.8h, v17.8h transpose v4.8h, v5.8h, v0.8h, v1.8h transpose v22.8h, v23.8h, v18.8h, v19.8h transpose v6.8h, v7.8h, v2.8h, v3.8h SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h transpose v20.4s, v22.4s, v2.4s, v0.4s transpose v21.4s, v23.4s, v3.4s, v1.4s transpose v16.4s, v18.4s, v24.4s, v4.4s transpose v17.4s, v19.4s, v25.4s, v5.4s SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h transpose v16.2d, v20.2d, v0.2d, v4.2d transpose v17.2d, v21.2d, v1.2d, v5.2d transpose v18.2d, v22.2d, v2.2d, v6.2d transpose v19.2d, v23.2d, v3.2d, v7.2d abs v16.8h, v16.8h abs v20.8h, v20.8h abs v17.8h, v17.8h abs v21.8h, v21.8h abs v18.8h, v18.8h abs v22.8h, v22.8h abs v19.8h, v19.8h abs v23.8h, v23.8h umax v16.8h, v16.8h, v20.8h umax v17.8h, v17.8h, v21.8h umax v18.8h, v18.8h, v22.8h umax v19.8h, v19.8h, v23.8h add v0.8h, v16.8h, v17.8h add v1.8h, v18.8h, v19.8h ret endfunc .endm .macro HADAMARD_AC_SVE w h function pixel_hadamard_ac_\w\()x\h\()_sve, export=1 ptrue p0.h, vl8 movrel x5, mask_ac_4_8 mov x4, x30 ld1 {v30.8h,v31.8h}, [x5] movi v28.16b, #0 movi v29.16b, #0 bl hadamard_ac_8x8_sve .if \h > 8 bl hadamard_ac_8x8_sve .endif .if \w > 8 sub x0, x0, x1, lsl #3 add x0, x0, #8 bl hadamard_ac_8x8_sve .endif .if \w * \h == 256 sub x0, x0, x1, lsl #4 bl hadamard_ac_8x8_sve .endif addv s1, v29.4s addv s0, v28.4s mov w1, v1.s[0] mov w0, v0.s[0] lsr w1, w1, #2 lsr w0, w0, #1 orr x0, x0, x1, lsl #32 ret x4 endfunc .endm // v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 function hadamard_ac_8x8_sve ld1b {z16.h}, p0/z, [x0] add x0, x0, x1 ld1b {z17.h}, p0/z, [x0] add x0, x0, x1 ld1b {z18.h}, p0/z, [x0] add x0, x0, x1 ld1b {z19.h}, p0/z, [x0] add x0, x0, x1 SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h ld1b {z20.h}, p0/z, [x0] add x0, x0, x1 ld1b {z21.h}, p0/z, [x0] add x0, x0, x1 SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h ld1b {z22.h}, p0/z, [x0] add x0, x0, x1 ld1b {z23.h}, p0/z, [x0] add x0, x0, x1 SUMSUBL_AB_SVE 
v4.8h, v5.8h, v20.8h, v21.8h SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h transpose v0.4s, v2.4s, v16.4s, v18.4s transpose v1.4s, v3.4s, v17.4s, v19.4s transpose v4.4s, v6.4s, v20.4s, v22.4s transpose v5.4s, v7.4s, v21.4s, v23.4s SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h abs v0.8h, v16.8h abs v4.8h, v20.8h abs v1.8h, v17.8h abs v5.8h, v21.8h abs v2.8h, v18.8h abs v6.8h, v22.8h abs v3.8h, v19.8h abs v7.8h, v23.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h and v0.16b, v0.16b, v30.16b add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h uadalp v28.4s, v0.8h uadalp v28.4s, v1.8h SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h transpose v16.2d, v17.2d, v6.2d, v7.2d transpose v18.2d, v19.2d, v4.2d, v5.2d transpose v20.2d, v21.2d, v2.2d, v3.2d abs v16.8h, v16.8h abs v17.8h, v17.8h abs v18.8h, v18.8h abs v19.8h, v19.8h abs v20.8h, v20.8h abs v21.8h, v21.8h transpose v7.2d, v6.2d, v1.2d, v0.2d umax v3.8h, v16.8h, v17.8h umax v2.8h, v18.8h, v19.8h umax v1.8h, v20.8h, v21.8h SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h add v2.8h, v2.8h, v3.8h add v2.8h, v2.8h, v1.8h and v4.16b, v4.16b, v31.16b add v2.8h, v2.8h, v2.8h abs v5.8h, v5.8h abs v4.8h, v4.8h add v2.8h, v2.8h, v5.8h add v2.8h, v2.8h, v4.8h uadalp v29.4s, v2.8h ret endfunc SSD_FUNC_SVE 4, 4 SSD_FUNC_SVE 4, 8 SSD_FUNC_SVE 4, 16 SSD_FUNC_SVE 8, 4 SSD_FUNC_SVE 8, 8 pixel_var_sve_8 8 pixel_var_sve_8 16 sa8d_satd_sve_8x8 HADAMARD_AC_SVE 8, 8 HADAMARD_AC_SVE 8, 16 HADAMARD_AC_SVE 16, 8 HADAMARD_AC_SVE 16, 16 #else /* BIT_DEPTH == 10 */ .macro SSD_START_SVE_4 ptrue p0.s, vl4 ld1h {z16.s}, p0/z, [x0] ld1h {z17.s}, p0/z, [x2] add x0, x0, x1, lsl #1 add x2, x2, x3, lsl #1 sub v2.4s, v16.4s, v17.4s ld1h {z16.s}, p0/z, [x0] ld1h {z17.s}, p0/z, [x2] add x0, x0, x1, lsl #1 add x2, x2, x3, lsl #1 mul v0.4s, v2.4s, v2.4s .endm .macro SSD_SVE_4 sub v2.4s, v16.4s, v17.4s ld1h {z16.s}, p0/z, [x0] ld1h {z17.s}, p0/z, [x2] add x0, x0, x1, lsl #1 add x2, x2, x3, lsl #1 mla v0.4s, v2.4s, v2.4s .endm .macro SSD_END_SVE_4 sub v2.4s, v16.4s, v17.4s mla v0.4s, v2.4s, v2.4s .endm .macro SSD_FUNC_SVE w h function pixel_ssd_\w\()x\h\()_sve, export=1 SSD_START_SVE_\w .rept \h-2 SSD_SVE_\w .endr SSD_END_SVE_\w addv s0, v0.4s fmov w0, s0 ret endfunc .endm SSD_FUNC_SVE 4, 4 SSD_FUNC_SVE 4, 8 SSD_FUNC_SVE 4, 16 #endif /* BIT_DEPTH == 8 */ x264-master/common/aarch64/pixel-a.S000066400000000000000000002520111502133446700172630ustar00rootroot00000000000000/***************************************************************************** * pixel.S: aarch64 pixel metrics ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software 
Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" #include "pixel-a-common.S" const mask .rept 16 .byte 0xff .endr .rept 16 .byte 0x00 .endr endconst .macro SUMSUBL_AB sum, sub, a, b uaddl \sum, \a, \b usubl \sub, \a, \b .endm #if BIT_DEPTH == 8 .macro SAD_START_4 ld1 {v1.s}[0], [x2], x3 ld1 {v0.s}[0], [x0], x1 ld1 {v1.s}[1], [x2], x3 ld1 {v0.s}[1], [x0], x1 uabdl v16.8h, v0.8b, v1.8b .endm .macro SAD_4 ld1 {v1.s}[0], [x2], x3 ld1 {v0.s}[0], [x0], x1 ld1 {v1.s}[1], [x2], x3 ld1 {v0.s}[1], [x0], x1 uabal v16.8h, v0.8b, v1.8b .endm .macro SAD_START_8 ld1 {v1.8b}, [x2], x3 ld1 {v0.8b}, [x0], x1 ld1 {v3.8b}, [x2], x3 ld1 {v2.8b}, [x0], x1 uabdl v16.8h, v0.8b, v1.8b uabdl v17.8h, v2.8b, v3.8b .endm .macro SAD_8 ld1 {v1.8b}, [x2], x3 ld1 {v0.8b}, [x0], x1 ld1 {v3.8b}, [x2], x3 ld1 {v2.8b}, [x0], x1 uabal v16.8h, v0.8b, v1.8b uabal v17.8h, v2.8b, v3.8b .endm .macro SAD_START_16, dotprod=0 ld1 {v1.16b}, [x2], x3 ld1 {v0.16b}, [x0], x1 ld1 {v3.16b}, [x2], x3 ld1 {v2.16b}, [x0], x1 .if \dotprod == 0 uabdl v16.8h, v0.8b, v1.8b uabdl2 v17.8h, v0.16b, v1.16b uabal v16.8h, v2.8b, v3.8b uabal2 v17.8h, v2.16b, v3.16b .else movi v18.4s, #0x0 movi v19.16b, #0x1 uabd v16.16b, v0.16b, v1.16b uabd v17.16b, v2.16b, v3.16b udot v18.4s, v16.16b, v19.16b udot v18.4s, v17.16b, v19.16b .endif .endm .macro SAD_16, dotprod=0 ld1 {v1.16b}, [x2], x3 ld1 {v0.16b}, [x0], x1 ld1 {v3.16b}, [x2], x3 ld1 {v2.16b}, [x0], x1 .if \dotprod == 0 uabal v16.8h, v0.8b, v1.8b uabal2 v17.8h, v0.16b, v1.16b uabal v16.8h, v2.8b, v3.8b uabal2 v17.8h, v2.16b, v3.16b .else uabd v16.16b, v0.16b, v1.16b uabd v17.16b, v2.16b, v3.16b udot v18.4s, v16.16b, v19.16b udot v18.4s, v17.16b, v19.16b .endif .endm .macro SAD_FUNC w, h, name function pixel_sad\name\()_\w\()x\h\()_neon, export=1 SAD_START_\w .rept \h / 2 - 1 SAD_\w .endr .if \w > 4 add v16.8h, v16.8h, v17.8h .endif uaddlv s0, v16.8h fmov w0, s0 ret endfunc .endm .macro SAD_FUNC_DOTPROD w, h, name function pixel_sad\name\()_\w\()x\h\()_neon_dotprod, export=1 SAD_START_\w 1 .rept \h / 2 - 1 SAD_\w 1 .endr addv s0, v18.4s fmov w0, s0 ret endfunc .endm .macro SAD_X_4 x, first=uabal ld1 {v0.s}[0], [x0], x7 ld1 {v1.s}[0], [x1], x5 ld1 {v0.s}[1], [x0], x7 ld1 {v1.s}[1], [x1], x5 ld1 {v2.s}[0], [x2], x5 ld1 {v2.s}[1], [x2], x5 \first v16.8h, v1.8b, v0.8b ld1 {v3.s}[0], [x3], x5 ld1 {v3.s}[1], [x3], x5 \first v17.8h, v2.8b, v0.8b .if \x == 4 ld1 {v4.s}[0], [x4], x5 ld1 {v4.s}[1], [x4], x5 .endif \first v18.8h, v3.8b, v0.8b .if \x == 4 \first v19.8h, v4.8b, v0.8b .endif .endm .macro SAD_X_8 x, first=uabal ld1 {v0.8b}, [x0], x7 ld1 {v1.8b}, [x1], x5 ld1 {v2.8b}, [x2], x5 \first v16.8h, v1.8b, v0.8b ld1 {v3.8b}, [x3], x5 \first v17.8h, v2.8b, v0.8b ld1 {v5.8b}, [x0], x7 ld1 {v1.8b}, [x1], x5 \first v18.8h, v3.8b, v0.8b ld1 {v2.8b}, [x2], x5 uabal v16.8h, v1.8b, v5.8b ld1 {v3.8b}, [x3], x5 uabal 
v17.8h, v2.8b, v5.8b .if \x == 4 ld1 {v4.8b}, [x4], x5 ld1 {v1.8b}, [x4], x5 .endif uabal v18.8h, v3.8b, v5.8b .if \x == 4 \first v19.8h, v4.8b, v0.8b uabal v19.8h, v1.8b, v5.8b .endif .endm .macro SAD_X_16 x, first=uabal ld1 {v0.16b}, [x0], x7 ld1 {v1.16b}, [x1], x5 ld1 {v2.16b}, [x2], x5 \first v16.8h, v1.8b, v0.8b \first\()2 v20.8h, v1.16b, v0.16b ld1 {v3.16b}, [x3], x5 \first v17.8h, v2.8b, v0.8b \first\()2 v21.8h, v2.16b, v0.16b ld1 {v5.16b}, [x0], x7 ld1 {v1.16b}, [x1], x5 \first v18.8h, v3.8b, v0.8b \first\()2 v22.8h, v3.16b, v0.16b ld1 {v2.16b}, [x2], x5 uabal v16.8h, v1.8b, v5.8b uabal2 v20.8h, v1.16b, v5.16b ld1 {v3.16b}, [x3], x5 uabal v17.8h, v2.8b, v5.8b uabal2 v21.8h, v2.16b, v5.16b .if \x == 4 ld1 {v4.16b}, [x4], x5 ld1 {v1.16b}, [x4], x5 .endif uabal v18.8h, v3.8b, v5.8b uabal2 v22.8h, v3.16b, v5.16b .if \x == 4 \first v19.8h, v4.8b, v0.8b \first\()2 v23.8h, v4.16b, v0.16b uabal v19.8h, v1.8b, v5.8b uabal2 v23.8h, v1.16b, v5.16b .endif .endm .macro SAD_X_FUNC x, w, h function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 .if \x == 3 mov x6, x5 mov x5, x4 .endif mov x7, #FENC_STRIDE SAD_X_\w \x, uabdl .rept \h / 2 - 1 SAD_X_\w \x .endr .if \w > 8 add v16.8h, v16.8h, v20.8h add v17.8h, v17.8h, v21.8h add v18.8h, v18.8h, v22.8h .if \x == 4 add v19.8h, v19.8h, v23.8h .endif .endif // add up the sads uaddlv s0, v16.8h uaddlv s1, v17.8h uaddlv s2, v18.8h stp s0, s1, [x6], #8 .if \x == 3 str s2, [x6] .else uaddlv s3, v19.8h stp s2, s3, [x6] .endif ret endfunc .endm .macro SAD_X_DOTPROD_16 x ld1 {v0.16b}, [x0], x7 ld1 {v1.16b}, [x1], x5 ld1 {v2.16b}, [x2], x5 uabd v20.16b, v1.16b, v0.16b uabd v22.16b, v2.16b, v0.16b ld1 {v5.16b}, [x0], x7 udot v16.4s, v20.16b, v28.16b udot v17.4s, v22.16b, v28.16b ld1 {v3.16b}, [x3], x5 ld1 {v1.16b}, [x1], x5 uabd v24.16b, v3.16b, v0.16b uabd v21.16b, v1.16b, v5.16b ld1 {v2.16b}, [x2], x5 ld1 {v3.16b}, [x3], x5 udot v18.4s, v24.16b, v28.16b udot v16.4s, v21.16b, v28.16b uabd v23.16b, v2.16b, v5.16b uabd v25.16b, v3.16b, v5.16b udot v17.4s, v23.16b, v28.16b udot v18.4s, v25.16b, v28.16b .if \x == 4 ld1 {v4.16b}, [x4], x5 ld1 {v1.16b}, [x4], x5 uabd v26.16b, v4.16b, v0.16b uabd v27.16b, v1.16b, v5.16b udot v19.4s, v26.16b, v28.16b udot v19.4s, v27.16b, v28.16b .endif .endm .macro SAD_X_DOTPROD_FUNC x, w, h function pixel_sad_x\x\()_\w\()x\h\()_neon_dotprod, export=1 movi v16.4s, #0x0 movi v17.4s, #0x0 movi v18.4s, #0x0 .if \x == 4 movi v19.4s, #0x0 .endif movi v28.16b, #0x1 .if \x == 3 mov x6, x5 mov x5, x4 .endif mov x7, #FENC_STRIDE SAD_X_DOTPROD_\w \x .rept \h / 2 - 1 SAD_X_DOTPROD_\w \x .endr addv s0, v16.4s addv s1, v17.4s addv s2, v18.4s .if \x == 4 addv s3, v19.4s .endif stp s0, s1, [x6], #8 .if \x == 3 str s2, [x6] .else stp s2, s3, [x6] .endif ret endfunc .endm function pixel_vsad_neon, export=1 subs w2, w2, #2 ld1 {v0.16b}, [x0], x1 ld1 {v1.16b}, [x0], x1 uabdl v6.8h, v0.8b, v1.8b uabdl2 v7.8h, v0.16b, v1.16b b.le 2f 1: subs w2, w2, #2 ld1 {v0.16b}, [x0], x1 uabal v6.8h, v1.8b, v0.8b uabal2 v7.8h, v1.16b, v0.16b ld1 {v1.16b}, [x0], x1 b.lt 2f uabal v6.8h, v0.8b, v1.8b uabal2 v7.8h, v0.16b, v1.16b b.gt 1b 2: add v5.8h, v6.8h, v7.8h uaddlv s0, v5.8h fmov w0, s0 ret endfunc #if HAVE_DOTPROD ENABLE_DOTPROD function pixel_vsad_neon_dotprod, export=1 ld1 {v0.16b}, [x0], x1 ld1 {v1.16b}, [x0], x1 subs w2, w2, #2 movi v3.16b, #0x1 movi v6.4s, #0x0 uabd v5.16b, v0.16b, v1.16b udot v6.4s, v5.16b, v3.16b b.le 2f 1: ld1 {v0.16b}, [x0], x1 subs w2, w2, #2 uabd v5.16b, v0.16b, v1.16b ld1 {v1.16b}, [x0], x1 udot v6.4s, v5.16b, v3.16b b.lt 2f uabd v5.16b, 
v0.16b, v1.16b udot v6.4s, v5.16b, v3.16b b.gt 1b 2: addv s0, v6.4s fmov w0, s0 ret endfunc DISABLE_DOTPROD #endif // HAVE_DOTPROD function pixel_asd8_neon, export=1 sub w4, w4, #2 ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x2], x3 ld1 {v2.8b}, [x0], x1 ld1 {v3.8b}, [x2], x3 usubl v16.8h, v0.8b, v1.8b 1: subs w4, w4, #2 ld1 {v4.8b}, [x0], x1 ld1 {v5.8b}, [x2], x3 usubl v17.8h, v2.8b, v3.8b usubl v18.8h, v4.8b, v5.8b add v16.8h, v16.8h, v17.8h ld1 {v2.8b}, [x0], x1 ld1 {v3.8b}, [x2], x3 add v16.8h, v16.8h, v18.8h b.gt 1b usubl v17.8h, v2.8b, v3.8b add v16.8h, v16.8h, v17.8h saddlv s0, v16.8h abs v0.2s, v0.2s fmov w0, s0 ret endfunc .macro SSD_START_4 ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 usubl v2.8h, v16.8b, v17.8b ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 smull v0.4s, v2.4h, v2.4h .endm .macro SSD_4 usubl v2.8h, v16.8b, v17.8b ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 smlal v0.4s, v2.4h, v2.4h .endm .macro SSD_END_4 usubl v2.8h, v16.8b, v17.8b smlal v0.4s, v2.4h, v2.4h .endm .macro SSD_START_8 ld1 {v16.8b}, [x0], x1 ld1 {v17.8b}, [x2], x3 usubl v2.8h, v16.8b, v17.8b ld1 {v16.8b}, [x0], x1 smull v0.4s, v2.4h, v2.4h ld1 {v17.8b}, [x2], x3 smlal2 v0.4s, v2.8h, v2.8h .endm .macro SSD_8 usubl v2.8h, v16.8b, v17.8b ld1 {v16.8b}, [x0], x1 smlal v0.4s, v2.4h, v2.4h ld1 {v17.8b}, [x2], x3 smlal2 v0.4s, v2.8h, v2.8h .endm .macro SSD_END_8 usubl v2.8h, v16.8b, v17.8b smlal v0.4s, v2.4h, v2.4h smlal2 v0.4s, v2.8h, v2.8h .endm .macro SSD_START_16 ld1 {v16.16b}, [x0], x1 ld1 {v17.16b}, [x2], x3 usubl v2.8h, v16.8b, v17.8b usubl2 v3.8h, v16.16b, v17.16b ld1 {v16.16b}, [x0], x1 smull v0.4s, v2.4h, v2.4h smull2 v1.4s, v2.8h, v2.8h ld1 {v17.16b}, [x2], x3 smlal v0.4s, v3.4h, v3.4h smlal2 v1.4s, v3.8h, v3.8h .endm .macro SSD_16 usubl v2.8h, v16.8b, v17.8b usubl2 v3.8h, v16.16b, v17.16b ld1 {v16.16b}, [x0], x1 smlal v0.4s, v2.4h, v2.4h smlal2 v1.4s, v2.8h, v2.8h ld1 {v17.16b}, [x2], x3 smlal v0.4s, v3.4h, v3.4h smlal2 v1.4s, v3.8h, v3.8h .endm .macro SSD_END_16 usubl v2.8h, v16.8b, v17.8b usubl2 v3.8h, v16.16b, v17.16b smlal v0.4s, v2.4h, v2.4h smlal2 v1.4s, v2.8h, v2.8h smlal v0.4s, v3.4h, v3.4h smlal2 v1.4s, v3.8h, v3.8h add v0.4s, v0.4s, v1.4s .endm .macro SSD_FUNC w h function pixel_ssd_\w\()x\h\()_neon, export=1 SSD_START_\w .rept \h-2 SSD_\w .endr SSD_END_\w addv s0, v0.4s mov w0, v0.s[0] ret endfunc .endm .macro SSD_DOTPROD_8 ld1 {v16.8b}, [x0], x1 ld1 {v17.8b}, [x2], x3 ld1 {v18.8b}, [x0], x1 uabd v20.8b, v16.8b, v17.8b ld1 {v19.8b}, [x2], x3 uabd v21.8b, v18.8b, v19.8b udot v22.2s, v20.8b, v20.8b udot v22.2s, v21.8b, v21.8b .endm .macro SSD_DOTPROD_16 ld1 {v16.16b}, [x0], x1 ld1 {v17.16b}, [x2], x3 ld1 {v18.16b}, [x0], x1 uabd v20.16b, v16.16b, v17.16b ld1 {v19.16b}, [x2], x3 uabd v21.16b, v18.16b, v19.16b udot v22.4s, v20.16b, v20.16b udot v22.4s, v21.16b, v21.16b .endm .macro SSD_DOTPROD_FUNC w h function pixel_ssd_\w\()x\h\()_neon_dotprod, export=1 movi v22.4s, #0x0 .rept \h/2 SSD_DOTPROD_\w .endr .if \w > 8 addv s0, v22.4s .else addp v0.2s, v22.2s, v22.2s .endif mov w0, v0.s[0] ret endfunc .endm function pixel_satd_4x4_neon, export=1 ld1 {v1.s}[0], [x2], x3 ld1 {v0.s}[0], [x0], x1 ld1 {v3.s}[0], [x2], x3 ld1 {v2.s}[0], [x0], x1 ld1 {v1.s}[1], [x2], x3 ld1 {v0.s}[1], [x0], x1 ld1 {v3.s}[1], [x2], x3 ld1 {v2.s}[1], [x0], x1 usubl v0.8h, v0.8b, v1.8b usubl v1.8h, v2.8b, v3.8b SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h zip1 v0.2d, v2.2d, v3.2d zip2 v1.2d, v2.2d, v3.2d SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h trn1 v0.8h, v2.8h, v3.8h trn2 v1.8h, v2.8h, v3.8h SUMSUB_AB v2.8h, v3.8h, v0.8h, 
v1.8h trn1 v0.4s, v2.4s, v3.4s trn2 v1.4s, v2.4s, v3.4s abs v0.8h, v0.8h abs v1.8h, v1.8h umax v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret endfunc function pixel_satd_4x8_neon, export=1 ld1 {v1.s}[0], [x2], x3 ld1 {v0.s}[0], [x0], x1 ld1 {v3.s}[0], [x2], x3 ld1 {v2.s}[0], [x0], x1 ld1 {v5.s}[0], [x2], x3 ld1 {v4.s}[0], [x0], x1 ld1 {v7.s}[0], [x2], x3 ld1 {v6.s}[0], [x0], x1 ld1 {v1.s}[1], [x2], x3 ld1 {v0.s}[1], [x0], x1 ld1 {v3.s}[1], [x2], x3 ld1 {v2.s}[1], [x0], x1 ld1 {v5.s}[1], [x2], x3 ld1 {v4.s}[1], [x0], x1 ld1 {v7.s}[1], [x2], x3 ld1 {v6.s}[1], [x0], x1 b satd_4x8_8x4_end_neon endfunc function pixel_satd_8x4_neon, export=1 ld1 {v1.8b}, [x2], x3 ld1 {v0.8b}, [x0], x1 ld1 {v3.8b}, [x2], x3 ld1 {v2.8b}, [x0], x1 ld1 {v5.8b}, [x2], x3 ld1 {v4.8b}, [x0], x1 ld1 {v7.8b}, [x2], x3 ld1 {v6.8b}, [x0], x1 endfunc function satd_4x8_8x4_end_neon usubl v0.8h, v0.8b, v1.8b usubl v1.8h, v2.8b, v3.8b usubl v2.8h, v4.8b, v5.8b usubl v3.8h, v6.8b, v7.8b SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h trn1 v0.8h, v4.8h, v5.8h trn2 v1.8h, v4.8h, v5.8h trn1 v2.8h, v6.8h, v7.8h trn2 v3.8h, v6.8h, v7.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h trn1 v0.4s, v16.4s, v18.4s trn2 v1.4s, v16.4s, v18.4s trn1 v2.4s, v17.4s, v19.4s trn2 v3.4s, v17.4s, v19.4s abs v0.8h, v0.8h abs v1.8h, v1.8h abs v2.8h, v2.8h abs v3.8h, v3.8h umax v0.8h, v0.8h, v1.8h umax v1.8h, v2.8h, v3.8h add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret endfunc function pixel_satd_4x16_neon, export=1 mov x4, x30 ld1 {v1.s}[0], [x2], x3 ld1 {v0.s}[0], [x0], x1 ld1 {v3.s}[0], [x2], x3 ld1 {v2.s}[0], [x0], x1 ld1 {v5.s}[0], [x2], x3 ld1 {v4.s}[0], [x0], x1 ld1 {v7.s}[0], [x2], x3 ld1 {v6.s}[0], [x0], x1 ld1 {v1.s}[1], [x2], x3 ld1 {v0.s}[1], [x0], x1 ld1 {v3.s}[1], [x2], x3 ld1 {v2.s}[1], [x0], x1 ld1 {v5.s}[1], [x2], x3 ld1 {v4.s}[1], [x0], x1 ld1 {v7.s}[1], [x2], x3 ld1 {v6.s}[1], [x0], x1 usubl v16.8h, v0.8b, v1.8b usubl v17.8h, v2.8b, v3.8b usubl v18.8h, v4.8b, v5.8b usubl v19.8h, v6.8b, v7.8b ld1 {v1.s}[0], [x2], x3 ld1 {v0.s}[0], [x0], x1 ld1 {v3.s}[0], [x2], x3 ld1 {v2.s}[0], [x0], x1 ld1 {v5.s}[0], [x2], x3 ld1 {v4.s}[0], [x0], x1 ld1 {v7.s}[0], [x2], x3 ld1 {v6.s}[0], [x0], x1 ld1 {v1.s}[1], [x2], x3 ld1 {v0.s}[1], [x0], x1 ld1 {v3.s}[1], [x2], x3 ld1 {v2.s}[1], [x0], x1 ld1 {v5.s}[1], [x2], x3 ld1 {v4.s}[1], [x0], x1 ld1 {v7.s}[1], [x2], x3 ld1 {v6.s}[1], [x0], x1 usubl v20.8h, v0.8b, v1.8b usubl v21.8h, v2.8b, v3.8b usubl v22.8h, v4.8b, v5.8b usubl v23.8h, v6.8b, v7.8b SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h bl satd_8x4v_8x8h_neon add v30.8h, v0.8h, v1.8h add v31.8h, v2.8h, v3.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc .macro load_diff_fly_8x8 ld1 {v1.8b}, [x2], x3 ld1 {v0.8b}, [x0], x1 ld1 {v3.8b}, [x2], x3 ld1 {v2.8b}, [x0], x1 usubl v16.8h, v0.8b, v1.8b ld1 {v5.8b}, [x2], x3 ld1 {v4.8b}, [x0], x1 usubl v17.8h, v2.8b, v3.8b ld1 {v7.8b}, [x2], x3 ld1 {v6.8b}, [x0], x1 usubl v18.8h, v4.8b, v5.8b ld1 {v1.8b}, [x2], x3 ld1 {v0.8b}, [x0], x1 usubl v19.8h, v6.8b, v7.8b ld1 {v3.8b}, [x2], x3 ld1 {v2.8b}, [x0], x1 usubl v20.8h, v0.8b, v1.8b ld1 {v5.8b}, [x2], x3 ld1 {v4.8b}, [x0], x1 usubl v21.8h, v2.8b, v3.8b ld1 {v7.8b}, [x2], x3 ld1 {v6.8b}, [x0], x1 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h usubl v22.8h, v4.8b, v5.8b usubl v23.8h, v6.8b, v7.8b .endm function 
pixel_satd_8x8_neon, export=1 mov x4, x30 bl satd_8x8_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function pixel_satd_8x16_neon, export=1 mov x4, x30 bl satd_8x8_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v0.8h, v1.8h bl satd_8x8_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v31.8h, v0.8h, v1.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function satd_8x8_neon load_diff_fly_8x8 endfunc // one vertical hadamard pass and two horizontal function satd_8x4v_8x8h_neon SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h transpose v0.4s, v2.4s, v16.4s, v18.4s transpose v1.4s, v3.4s, v17.4s, v19.4s transpose v4.4s, v6.4s, v20.4s, v22.4s transpose v5.4s, v7.4s, v21.4s, v23.4s abs v0.8h, v0.8h abs v1.8h, v1.8h abs v2.8h, v2.8h abs v3.8h, v3.8h abs v4.8h, v4.8h abs v5.8h, v5.8h abs v6.8h, v6.8h abs v7.8h, v7.8h umax v0.8h, v0.8h, v2.8h umax v1.8h, v1.8h, v3.8h umax v2.8h, v4.8h, v6.8h umax v3.8h, v5.8h, v7.8h ret endfunc function pixel_ssd_nv12_core_neon, export=1 sxtw x8, w4 add x8, x8, #8 and x8, x8, #~15 movi v6.2d, #0 movi v7.2d, #0 sub x1, x1, x8, lsl #1 sub x3, x3, x8, lsl #1 1: subs w8, w4, #16 ld2 {v0.8b,v1.8b}, [x0], #16 ld2 {v2.8b,v3.8b}, [x2], #16 ld2 {v24.8b,v25.8b}, [x0], #16 ld2 {v26.8b,v27.8b}, [x2], #16 usubl v16.8h, v0.8b, v2.8b usubl v17.8h, v1.8b, v3.8b smull v20.4s, v16.4h, v16.4h smull v21.4s, v17.4h, v17.4h usubl v18.8h, v24.8b, v26.8b usubl v19.8h, v25.8b, v27.8b smlal2 v20.4s, v16.8h, v16.8h smlal2 v21.4s, v17.8h, v17.8h b.lt 4f b.eq 3f 2: smlal v20.4s, v18.4h, v18.4h smlal v21.4s, v19.4h, v19.4h ld2 {v0.8b,v1.8b}, [x0], #16 ld2 {v2.8b,v3.8b}, [x2], #16 smlal2 v20.4s, v18.8h, v18.8h smlal2 v21.4s, v19.8h, v19.8h subs w8, w8, #16 usubl v16.8h, v0.8b, v2.8b usubl v17.8h, v1.8b, v3.8b smlal v20.4s, v16.4h, v16.4h smlal v21.4s, v17.4h, v17.4h ld2 {v24.8b,v25.8b}, [x0], #16 ld2 {v26.8b,v27.8b}, [x2], #16 smlal2 v20.4s, v16.8h, v16.8h smlal2 v21.4s, v17.8h, v17.8h b.lt 4f usubl v18.8h, v24.8b, v26.8b usubl v19.8h, v25.8b, v27.8b b.gt 2b 3: smlal v20.4s, v18.4h, v18.4h smlal v21.4s, v19.4h, v19.4h smlal2 v20.4s, v18.8h, v18.8h smlal2 v21.4s, v19.8h, v19.8h 4: subs w5, w5, #1 uaddw v6.2d, v6.2d, v20.2s uaddw v7.2d, v7.2d, v21.2s add x0, x0, x1 add x2, x2, x3 uaddw2 v6.2d, v6.2d, v20.4s uaddw2 v7.2d, v7.2d, v21.4s b.gt 1b addp v6.2d, v6.2d, v7.2d st1 {v6.d}[0], [x6] st1 {v6.d}[1], [x7] ret endfunc .macro pixel_var_8 h function pixel_var_8x\h\()_neon, export=1 ld1 {v16.8b}, [x0], x1 ld1 {v17.8b}, [x0], x1 mov x2, \h - 4 umull v1.8h, v16.8b, v16.8b uxtl v0.8h, v16.8b umull v2.8h, v17.8b, v17.8b uaddw v0.8h, v0.8h, v17.8b ld1 {v18.8b}, [x0], x1 uaddlp v1.4s, v1.8h uaddlp v2.4s, v2.8h ld1 {v19.8b}, [x0], x1 1: subs x2, x2, #4 uaddw v0.8h, v0.8h, v18.8b umull v24.8h, v18.8b, v18.8b ld1 {v20.8b}, [x0], x1 uaddw v0.8h, v0.8h, v19.8b umull v25.8h, v19.8b, v19.8b uadalp v1.4s, v24.8h ld1 {v21.8b}, [x0], x1 uaddw v0.8h, v0.8h, v20.8b umull v26.8h, v20.8b, v20.8b uadalp v2.4s, v25.8h ld1 {v18.8b}, [x0], x1 uaddw v0.8h, v0.8h, v21.8b umull v27.8h, v21.8b, 
v21.8b uadalp v1.4s, v26.8h ld1 {v19.8b}, [x0], x1 uadalp v2.4s, v27.8h b.gt 1b uaddw v0.8h, v0.8h, v18.8b umull v28.8h, v18.8b, v18.8b uaddw v0.8h, v0.8h, v19.8b umull v29.8h, v19.8b, v19.8b uadalp v1.4s, v28.8h uadalp v2.4s, v29.8h b var_end endfunc .endm function pixel_var_16x16_neon, export=1 ld1 {v16.16b}, [x0], x1 ld1 {v17.16b}, [x0], x1 mov x2, #14 umull v1.8h, v16.8b, v16.8b umull2 v2.8h, v16.16b, v16.16b uxtl v0.8h, v16.8b uaddlp v1.4s, v1.8h uaddlp v2.4s, v2.8h uaddw2 v0.8h, v0.8h, v16.16b 1: subs x2, x2, #2 ld1 {v18.16b}, [x0], x1 uaddw v0.8h, v0.8h, v17.8b umull v3.8h, v17.8b, v17.8b uaddw2 v0.8h, v0.8h, v17.16b umull2 v4.8h, v17.16b, v17.16b uadalp v1.4s, v3.8h uadalp v2.4s, v4.8h ld1 {v17.16b}, [x0], x1 uaddw v0.8h, v0.8h, v18.8b umull v5.8h, v18.8b, v18.8b uaddw2 v0.8h, v0.8h, v18.16b umull2 v6.8h, v18.16b, v18.16b uadalp v1.4s, v5.8h uadalp v2.4s, v6.8h b.gt 1b uaddw v0.8h, v0.8h, v17.8b umull v3.8h, v17.8b, v17.8b uaddw2 v0.8h, v0.8h, v17.16b umull2 v4.8h, v17.16b, v17.16b uadalp v1.4s, v3.8h uadalp v2.4s, v4.8h endfunc function var_end add v1.4s, v1.4s, v2.4s uaddlv s0, v0.8h uaddlv d1, v1.4s mov w0, v0.s[0] mov x1, v1.d[0] orr x0, x0, x1, lsl #32 ret endfunc .macro pixel_var2_8 h function pixel_var2_8x\h\()_neon, export=1 mov x3, #16 ld1 {v16.8b}, [x0], #8 ld1 {v18.8b}, [x1], x3 ld1 {v17.8b}, [x0], #8 ld1 {v19.8b}, [x1], x3 mov x5, \h - 2 usubl v0.8h, v16.8b, v18.8b usubl v1.8h, v17.8b, v19.8b ld1 {v16.8b}, [x0], #8 ld1 {v18.8b}, [x1], x3 smull v2.4s, v0.4h, v0.4h smull2 v3.4s, v0.8h, v0.8h smull v4.4s, v1.4h, v1.4h smull2 v5.4s, v1.8h, v1.8h usubl v6.8h, v16.8b, v18.8b 1: subs x5, x5, #1 ld1 {v17.8b}, [x0], #8 ld1 {v19.8b}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h usubl v7.8h, v17.8b, v19.8b add v0.8h, v0.8h, v6.8h ld1 {v16.8b}, [x0], #8 ld1 {v18.8b}, [x1], x3 smlal v4.4s, v7.4h, v7.4h smlal2 v5.4s, v7.8h, v7.8h usubl v6.8h, v16.8b, v18.8b add v1.8h, v1.8h, v7.8h b.gt 1b ld1 {v17.8b}, [x0], #8 ld1 {v19.8b}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h usubl v7.8h, v17.8b, v19.8b add v0.8h, v0.8h, v6.8h smlal v4.4s, v7.4h, v7.4h add v1.8h, v1.8h, v7.8h smlal2 v5.4s, v7.8h, v7.8h saddlv s0, v0.8h saddlv s1, v1.8h add v2.4s, v2.4s, v3.4s add v4.4s, v4.4s, v5.4s mov w0, v0.s[0] mov w1, v1.s[0] addv s2, v2.4s addv s4, v4.4s mul w0, w0, w0 mul w1, w1, w1 mov w3, v2.s[0] mov w4, v4.s[0] sub w0, w3, w0, lsr # 6 + (\h >> 4) sub w1, w4, w1, lsr # 6 + (\h >> 4) str w3, [x2] add w0, w0, w1 str w4, [x2, #4] ret endfunc .endm function pixel_satd_16x8_neon, export=1 mov x4, x30 bl satd_16x4_neon add v30.8h, v0.8h, v1.8h add v31.8h, v2.8h, v3.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v30.8h, v0.8h add v31.8h, v31.8h, v1.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function pixel_satd_16x16_neon, export=1 mov x4, x30 bl satd_16x4_neon add v30.8h, v0.8h, v1.8h add v31.8h, v2.8h, v3.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v30.8h, v0.8h add v31.8h, v31.8h, v1.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v30.8h, v0.8h add v31.8h, v31.8h, v1.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v30.8h, v0.8h add v31.8h, v31.8h, v1.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function satd_16x4_neon ld1 {v1.16b}, [x2], x3 ld1 {v0.16b}, [x0], x1 ld1 {v3.16b}, [x2], x3 ld1 {v2.16b}, [x0], x1 usubl v16.8h, v0.8b, v1.8b usubl2 v20.8h, v0.16b, v1.16b ld1 
{v5.16b}, [x2], x3 ld1 {v4.16b}, [x0], x1 usubl v17.8h, v2.8b, v3.8b usubl2 v21.8h, v2.16b, v3.16b ld1 {v7.16b}, [x2], x3 ld1 {v6.16b}, [x0], x1 usubl v18.8h, v4.8b, v5.8b usubl2 v22.8h, v4.16b, v5.16b usubl v19.8h, v6.8b, v7.8b usubl2 v23.8h, v6.16b, v7.16b SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h b satd_8x4v_8x8h_neon endfunc function pixel_sa8d_8x8_neon, export=1 mov x4, x30 bl pixel_sa8d_8x8_neon add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] add w0, w0, #1 lsr w0, w0, #1 ret x4 endfunc function pixel_sa8d_16x16_neon, export=1 mov x4, x30 bl pixel_sa8d_8x8_neon uaddlp v30.4s, v0.8h uaddlp v31.4s, v1.8h bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h sub x0, x0, x1, lsl #4 sub x2, x2, x3, lsl #4 add x0, x0, #8 add x2, x2, #8 bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h add v0.4s, v30.4s, v31.4s addv s0, v0.4s mov w0, v0.s[0] add w0, w0, #1 lsr w0, w0, #1 ret x4 endfunc .macro sa8d_satd_8x8 satd= function pixel_sa8d_\satd\()8x8_neon load_diff_fly_8x8 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h .ifc \satd, satd_ transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h transpose v4.4s, v6.4s, v24.4s, v26.4s transpose v5.4s, v7.4s, v25.4s, v27.4s transpose v24.4s, v26.4s, v0.4s, v2.4s transpose v25.4s, v27.4s, v1.4s, v3.4s abs v0.8h, v4.8h abs v1.8h, v5.8h abs v2.8h, v6.8h abs v3.8h, v7.8h abs v4.8h, v24.8h abs v5.8h, v25.8h abs v6.8h, v26.8h abs v7.8h, v27.8h umax v0.8h, v0.8h, v2.8h umax v1.8h, v1.8h, v3.8h umax v2.8h, v4.8h, v6.8h umax v3.8h, v5.8h, v7.8h add v26.8h, v0.8h, v1.8h add v27.8h, v2.8h, v3.8h .endif SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h transpose v20.8h, v21.8h, v16.8h, v17.8h transpose v4.8h, v5.8h, v0.8h, v1.8h transpose v22.8h, v23.8h, v18.8h, v19.8h transpose v6.8h, v7.8h, v2.8h, v3.8h SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h transpose v20.4s, v22.4s, v2.4s, v0.4s transpose v21.4s, v23.4s, v3.4s, v1.4s transpose v16.4s, v18.4s, v24.4s, v4.4s transpose v17.4s, v19.4s, v25.4s, v5.4s SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h transpose v16.2d, v20.2d, v0.2d, v4.2d transpose v17.2d, v21.2d, v1.2d, v5.2d transpose v18.2d, v22.2d, v2.2d, v6.2d transpose v19.2d, v23.2d, v3.2d, v7.2d abs v16.8h, v16.8h abs v20.8h, v20.8h abs v17.8h, v17.8h abs v21.8h, v21.8h abs v18.8h, v18.8h abs v22.8h, v22.8h abs v19.8h, v19.8h abs v23.8h, v23.8h umax v16.8h, v16.8h, v20.8h umax v17.8h, v17.8h, v21.8h umax v18.8h, v18.8h, v22.8h umax v19.8h, v19.8h, v23.8h add v0.8h, v16.8h, v17.8h add v1.8h, v18.8h, v19.8h ret endfunc .endm function pixel_sa8d_satd_16x16_neon, export=1 mov x4, x30 bl pixel_sa8d_satd_8x8_neon uaddlp v30.4s, v0.8h uaddlp v31.4s, v1.8h uaddlp v28.4s, v26.8h uaddlp v29.4s, v27.8h bl pixel_sa8d_satd_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, 
v1.8h uadalp v28.4s, v26.8h uadalp v29.4s, v27.8h sub x0, x0, x1, lsl #4 sub x2, x2, x3, lsl #4 add x0, x0, #8 add x2, x2, #8 bl pixel_sa8d_satd_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h uadalp v28.4s, v26.8h uadalp v29.4s, v27.8h bl pixel_sa8d_satd_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h uadalp v28.4s, v26.8h uadalp v29.4s, v27.8h add v0.4s, v30.4s, v31.4s // sa8d add v1.4s, v28.4s, v29.4s // satd addv s0, v0.4s addv s1, v1.4s urshr v0.4s, v0.4s, #1 fmov w0, s0 fmov w1, s1 add x0, x0, x1, lsl #32 ret x4 endfunc .macro HADAMARD_AC w h function pixel_hadamard_ac_\w\()x\h\()_neon, export=1 movrel x5, mask_ac_4_8 mov x4, x30 ld1 {v30.8h,v31.8h}, [x5] movi v28.16b, #0 movi v29.16b, #0 bl hadamard_ac_8x8_neon .if \h > 8 bl hadamard_ac_8x8_neon .endif .if \w > 8 sub x0, x0, x1, lsl #3 add x0, x0, #8 bl hadamard_ac_8x8_neon .endif .if \w * \h == 256 sub x0, x0, x1, lsl #4 bl hadamard_ac_8x8_neon .endif addv s1, v29.4s addv s0, v28.4s mov w1, v1.s[0] mov w0, v0.s[0] lsr w1, w1, #2 lsr w0, w0, #1 orr x0, x0, x1, lsl #32 ret x4 endfunc .endm // v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 function hadamard_ac_8x8_neon ld1 {v16.8b}, [x0], x1 ld1 {v17.8b}, [x0], x1 ld1 {v18.8b}, [x0], x1 ld1 {v19.8b}, [x0], x1 SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b ld1 {v20.8b}, [x0], x1 ld1 {v21.8b}, [x0], x1 SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b ld1 {v22.8b}, [x0], x1 ld1 {v23.8b}, [x0], x1 SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h transpose v0.4s, v2.4s, v16.4s, v18.4s transpose v1.4s, v3.4s, v17.4s, v19.4s transpose v4.4s, v6.4s, v20.4s, v22.4s transpose v5.4s, v7.4s, v21.4s, v23.4s SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h abs v0.8h, v16.8h abs v4.8h, v20.8h abs v1.8h, v17.8h abs v5.8h, v21.8h abs v2.8h, v18.8h abs v6.8h, v22.8h abs v3.8h, v19.8h abs v7.8h, v23.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h and v0.16b, v0.16b, v30.16b add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h uadalp v28.4s, v0.8h uadalp v28.4s, v1.8h SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h transpose v16.2d, v17.2d, v6.2d, v7.2d transpose v18.2d, v19.2d, v4.2d, v5.2d transpose v20.2d, v21.2d, v2.2d, v3.2d abs v16.8h, v16.8h abs v17.8h, v17.8h abs v18.8h, v18.8h abs v19.8h, v19.8h abs v20.8h, v20.8h abs v21.8h, v21.8h transpose v7.2d, v6.2d, v1.2d, v0.2d umax v3.8h, v16.8h, v17.8h umax v2.8h, v18.8h, v19.8h umax v1.8h, v20.8h, v21.8h SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h add v2.8h, v2.8h, v3.8h add v2.8h, v2.8h, v1.8h and v4.16b, v4.16b, v31.16b add v2.8h, v2.8h, v2.8h abs v5.8h, v5.8h abs v4.8h, v4.8h add v2.8h, v2.8h, v5.8h add v2.8h, v2.8h, v4.8h uadalp v29.4s, v2.8h ret endfunc function pixel_ssim_4x4x2_core_neon, export=1 ld1 {v0.8b}, [x0], x1 ld1 {v2.8b}, [x2], x3 umull v16.8h, v0.8b, v0.8b umull v17.8h, v0.8b, v2.8b umull v18.8h, v2.8b, v2.8b ld1 {v28.8b}, 
[x0], x1 ld1 {v29.8b}, [x2], x3 umull v20.8h, v28.8b, v28.8b umull v21.8h, v28.8b, v29.8b umull v22.8h, v29.8b, v29.8b uaddlp v16.4s, v16.8h uaddlp v17.4s, v17.8h uaddl v0.8h, v0.8b, v28.8b uadalp v16.4s, v18.8h uaddl v1.8h, v2.8b, v29.8b ld1 {v26.8b}, [x0], x1 ld1 {v27.8b}, [x2], x3 umull v23.8h, v26.8b, v26.8b umull v24.8h, v26.8b, v27.8b umull v25.8h, v27.8b, v27.8b uadalp v16.4s, v20.8h uaddw v0.8h, v0.8h, v26.8b uadalp v17.4s, v21.8h uaddw v1.8h, v1.8h, v27.8b uadalp v16.4s, v22.8h ld1 {v28.8b}, [x0], x1 ld1 {v29.8b}, [x2], x3 umull v20.8h, v28.8b, v28.8b umull v21.8h, v28.8b, v29.8b umull v22.8h, v29.8b, v29.8b uadalp v16.4s, v23.8h uaddw v0.8h, v0.8h, v28.8b uadalp v17.4s, v24.8h uaddw v1.8h, v1.8h, v29.8b uadalp v16.4s, v25.8h uadalp v16.4s, v20.8h uadalp v17.4s, v21.8h uadalp v16.4s, v22.8h uaddlp v0.4s, v0.8h uaddlp v1.4s, v1.8h addp v0.4s, v0.4s, v0.4s addp v1.4s, v1.4s, v1.4s addp v2.4s, v16.4s, v16.4s addp v3.4s, v17.4s, v17.4s st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4] ret endfunc function pixel_ssim_end4_neon, export=1 mov x5, #4 ld1 {v16.4s,v17.4s}, [x0], #32 ld1 {v18.4s,v19.4s}, [x1], #32 mov w4, #0x99bb subs x2, x5, w2, uxtw mov w3, #416 // ssim_c1 = .01*.01*255*255*64 movk w4, #0x03, lsl #16 // ssim_c2 = .03*.03*255*255*64*63 add v0.4s, v16.4s, v18.4s add v1.4s, v17.4s, v19.4s add v0.4s, v0.4s, v1.4s ld1 {v20.4s,v21.4s}, [x0], #32 ld1 {v22.4s,v23.4s}, [x1], #32 add v2.4s, v20.4s, v22.4s add v3.4s, v21.4s, v23.4s add v1.4s, v1.4s, v2.4s ld1 {v16.4s}, [x0], #16 ld1 {v18.4s}, [x1], #16 add v16.4s, v16.4s, v18.4s add v2.4s, v2.4s, v3.4s add v3.4s, v3.4s, v16.4s dup v30.4s, w3 dup v31.4s, w4 transpose v4.4s, v5.4s, v0.4s, v1.4s transpose v6.4s, v7.4s, v2.4s, v3.4s transpose v0.2d, v2.2d, v4.2d, v6.2d transpose v1.2d, v3.2d, v5.2d, v7.2d mul v16.4s, v0.4s, v1.4s // s1*s2 mul v0.4s, v0.4s, v0.4s mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2 shl v3.4s, v3.4s, #7 shl v2.4s, v2.4s, #6 add v1.4s, v16.4s, v16.4s sub v2.4s, v2.4s, v0.4s // vars sub v3.4s, v3.4s, v1.4s // covar*2 add v0.4s, v0.4s, v30.4s add v2.4s, v2.4s, v31.4s add v1.4s, v1.4s, v30.4s add v3.4s, v3.4s, v31.4s scvtf v0.4s, v0.4s scvtf v2.4s, v2.4s scvtf v1.4s, v1.4s scvtf v3.4s, v3.4s fmul v0.4s, v0.4s, v2.4s fmul v1.4s, v1.4s, v3.4s fdiv v0.4s, v1.4s, v0.4s b.eq 1f movrel x3, mask add x3, x3, x2, lsl #2 ld1 {v29.4s}, [x3] and v0.16b, v0.16b, v29.16b 1: faddp v0.4s, v0.4s, v0.4s faddp s0, v0.2s ret endfunc #else /* BIT_DEPTH == 8 */ .macro SAD_START_4 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v1.d}[0], [x2], x3 ld1 {v0.d}[0], [x0], x1 ld1 {v1.d}[1], [x2], x3 ld1 {v0.d}[1], [x0], x1 uabdl v16.4s, v0.4h, v1.4h uabdl2 v18.4s, v0.8h, v1.8h .endm .macro SAD_4 ld1 {v1.d}[0], [x2], x3 ld1 {v0.d}[0], [x0], x1 ld1 {v1.d}[1], [x2], x3 ld1 {v0.d}[1], [x0], x1 uabal v16.4s, v0.4h, v1.4h uabal2 v18.4s, v0.8h, v1.8h .endm .macro SAD_START_8 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v1.8h}, [x2], x3 ld1 {v0.8h}, [x0], x1 ld1 {v3.8h}, [x2], x3 ld1 {v2.8h}, [x0], x1 uabdl v16.4s, v0.4h, v1.4h uabdl2 v17.4s, v0.8h, v1.8h uabdl v18.4s, v2.4h, v3.4h uabdl2 v19.4s, v2.8h, v3.8h .endm .macro SAD_8 ld1 {v1.8h}, [x2], x3 ld1 {v0.8h}, [x0], x1 ld1 {v3.8h}, [x2], x3 ld1 {v2.8h}, [x0], x1 uabal v16.4s, v0.4h, v1.4h uabal2 v17.4s, v0.8h, v1.8h uabal v18.4s, v2.4h, v3.4h uabal2 v19.4s, v2.8h, v3.8h .endm .macro SAD_START_16 lsl x1, x1, #1 lsl x3, x3, #1 ld2 {v0.8h, v1.8h}, [x2], x3 ld2 {v2.8h, v3.8h}, [x0], x1 ld2 {v4.8h, v5.8h}, [x2], x3 ld2 {v6.8h, v7.8h}, [x0], x1 uabdl v16.4s, v0.4h, v2.4h uabdl2 v17.4s, v0.8h, v2.8h uabdl v20.4s, v1.4h, v3.4h uabdl2 v21.4s, v1.8h, 
v3.8h uabdl v18.4s, v4.4h, v6.4h uabdl2 v19.4s, v4.8h, v6.8h uabdl v22.4s, v5.4h, v7.4h uabdl2 v23.4s, v5.8h, v7.8h .endm .macro SAD_16 ld2 {v0.8h, v1.8h}, [x2], x3 ld2 {v2.8h, v3.8h}, [x0], x1 ld2 {v4.8h, v5.8h}, [x2], x3 ld2 {v6.8h, v7.8h}, [x0], x1 uabal v16.4s, v0.4h, v2.4h uabal2 v17.4s, v0.8h, v2.8h uabal v20.4s, v1.4h, v3.4h uabal2 v21.4s, v1.8h, v3.8h uabal v18.4s, v4.4h, v6.4h uabal2 v19.4s, v4.8h, v6.8h uabal v22.4s, v5.4h, v7.4h uabal2 v23.4s, v5.8h, v7.8h .endm .macro SAD_FUNC w, h, name function pixel_sad\name\()_\w\()x\h\()_neon, export=1 SAD_START_\w .rept \h / 2 - 1 SAD_\w .endr .if \w > 8 add v20.4s, v20.4s, v21.4s add v16.4s, v16.4s, v20.4s add v22.4s, v22.4s, v23.4s add v18.4s, v18.4s, v22.4s .endif .if \w > 4 add v16.4s, v16.4s, v17.4s add v18.4s, v18.4s, v19.4s .endif add v16.4s, v16.4s, v18.4s uaddlv s0, v16.8h fmov w0, s0 ret endfunc .endm .macro SAD_X_4 x, first=uaba ld1 {v0.d}[0], [x0], x7 ld1 {v1.d}[0], [x1], x5 ld1 {v0.d}[1], [x0], x7 ld1 {v1.d}[1], [x1], x5 ld1 {v2.d}[0], [x2], x5 ld1 {v2.d}[1], [x2], x5 \first v16.8h, v1.8h, v0.8h ld1 {v3.d}[0], [x3], x5 ld1 {v3.d}[1], [x3], x5 \first v17.8h, v2.8h, v0.8h .if \x == 4 ld1 {v4.d}[0], [x4], x5 ld1 {v4.d}[1], [x4], x5 .endif \first v18.8h, v3.8h, v0.8h .if \x == 4 \first v19.8h, v4.8h, v0.8h .endif .endm .macro SAD_X_8 x, first=uaba ld1 {v0.8h}, [x0], x7 ld1 {v1.8h}, [x1], x5 \first v16.8h, v1.8h, v0.8h ld1 {v2.8h}, [x2], x5 ld1 {v3.8h}, [x3], x5 \first v17.8h, v2.8h, v0.8h ld1 {v5.8h}, [x0], x7 ld1 {v1.8h}, [x1], x5 \first v18.8h, v3.8h, v0.8h ld1 {v2.8h}, [x2], x5 uaba v16.8h, v1.8h, v5.8h ld1 {v3.8h}, [x3], x5 uaba v17.8h, v2.8h, v5.8h .if \x == 4 ld1 {v4.8h}, [x4], x5 ld1 {v1.8h}, [x4], x5 .endif uaba v18.8h, v3.8h, v5.8h .if \x == 4 \first v19.8h, v4.8h, v0.8h uaba v19.8h, v1.8h, v5.8h .endif .endm .macro SAD_X_16 x, first=uaba ld1 {v0.8h, v1.8h}, [x0], x7 ld1 {v2.8h, v3.8h}, [x1], x5 ld1 {v4.8h, v5.8h}, [x2], x5 \first v16.8h, v2.8h, v0.8h \first v20.8h, v3.8h, v1.8h ld1 {v24.8h, v25.8h}, [x3], x5 \first v17.8h, v4.8h, v0.8h \first v21.8h, v5.8h, v1.8h ld1 {v6.8h, v7.8h}, [x0], x7 ld1 {v2.8h, v3.8h}, [x1], x5 \first v18.8h, v24.8h, v0.8h \first v22.8h, v25.8h, v1.8h ld1 {v4.8h, v5.8h}, [x2], x5 uaba v16.8h, v2.8h, v6.8h uaba v20.8h, v3.8h, v7.8h ld1 {v24.8h, v25.8h}, [x3], x5 uaba v17.8h, v4.8h, v6.8h uaba v21.8h, v5.8h, v7.8h .if \x == 4 ld1 {v26.8h, v27.8h}, [x4], x5 ld1 {v28.8h, v29.8h}, [x4], x5 .endif uaba v18.8h, v24.8h, v6.8h uaba v22.8h, v25.8h, v7.8h .if \x == 4 \first v19.8h, v26.8h, v0.8h \first v23.8h, v27.8h, v1.8h uaba v19.8h, v28.8h, v6.8h uaba v23.8h, v29.8h, v7.8h .endif .endm .macro SAD_X_FUNC x, w, h function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 .if \x == 3 mov x6, x5 mov x5, x4 .endif mov x7, #FENC_STRIDE lsl x5, x5, #1 lsl x7, x7, #1 SAD_X_\w \x, uabd .rept \h / 2 - 1 SAD_X_\w \x .endr .if \w > 8 add v16.8h, v16.8h, v20.8h add v17.8h, v17.8h, v21.8h add v18.8h, v18.8h, v22.8h .if \x == 4 add v19.8h, v19.8h, v23.8h .endif .endif // add up the sads uaddlv s0, v16.8h uaddlv s1, v17.8h uaddlv s2, v18.8h stp s0, s1, [x6], #8 .if \x == 3 str s2, [x6] .else uaddlv s3, v19.8h stp s2, s3, [x6] .endif ret endfunc .endm function pixel_vsad_neon, export=1 subs w2, w2, #2 lsl x1, x1, #1 ld1 {v0.8h, v1.8h}, [x0], x1 ld1 {v2.8h, v3.8h}, [x0], x1 uabd v6.8h, v0.8h, v2.8h uabd v7.8h, v1.8h, v3.8h b.le 2f 1: subs w2, w2, #2 ld1 {v0.8h, v1.8h}, [x0], x1 uaba v6.8h, v2.8h, v0.8h uaba v7.8h, v3.8h, v1.8h ld1 {v2.8h, v3.8h}, [x0], x1 b.lt 2f uaba v6.8h, v0.8h, v2.8h uaba v7.8h, v1.8h, v3.8h b.gt 1b 2: 
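// reduction: combine the two row-difference accumulators and sum across lanes into the scalar vsad result returned in w0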
add v5.8h, v6.8h, v7.8h uaddlv s0, v5.8h fmov w0, s0 ret endfunc function pixel_asd8_neon, export=1 sub w4, w4, #2 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v0.8h}, [x0], x1 ld1 {v1.8h}, [x2], x3 ld1 {v2.8h}, [x0], x1 ld1 {v3.8h}, [x2], x3 sub v16.8h, v0.8h, v1.8h 1: subs w4, w4, #2 ld1 {v4.8h}, [x0], x1 ld1 {v5.8h}, [x2], x3 sub v17.8h, v2.8h, v3.8h sub v18.8h, v4.8h, v5.8h add v16.8h, v16.8h, v17.8h ld1 {v2.8h}, [x0], x1 ld1 {v3.8h}, [x2], x3 add v16.8h, v16.8h, v18.8h b.gt 1b sub v17.8h, v2.8h, v3.8h add v16.8h, v16.8h, v17.8h saddlv s0, v16.8h abs v0.4s, v0.4s fmov w0, s0 ret endfunc .macro SSD_START_4 ld1 {v16.d}[0], [x0], x1 ld1 {v17.d}[0], [x2], x3 sub v2.4h, v16.4h, v17.4h ld1 {v16.d}[0], [x0], x1 ld1 {v17.d}[0], [x2], x3 smull v0.4s, v2.4h, v2.4h .endm .macro SSD_4 sub v2.4h, v16.4h, v17.4h ld1 {v16.d}[0], [x0], x1 ld1 {v17.d}[0], [x2], x3 smlal v0.4s, v2.4h, v2.4h .endm .macro SSD_END_4 sub v2.4h, v16.4h, v17.4h smlal v0.4s, v2.4h, v2.4h .endm .macro SSD_START_8 ld1 {v16.8h}, [x0], x1 ld1 {v17.8h}, [x2], x3 sub v2.8h, v16.8h, v17.8h ld1 {v16.8h}, [x0], x1 ld1 {v17.8h}, [x2], x3 smull v0.4s, v2.4h, v2.4h smull2 v20.4s, v2.8h, v2.8h .endm .macro SSD_8 sub v2.8h, v16.8h, v17.8h ld1 {v16.8h}, [x0], x1 ld1 {v17.8h}, [x2], x3 smlal v0.4s, v2.4h, v2.4h smlal2 v20.4s, v2.8h, v2.8h .endm .macro SSD_END_8 sub v2.8h, v16.8h, v17.8h smlal v0.4s, v2.4h, v2.4h smlal2 v20.4s, v2.8h, v2.8h add v0.4s, v0.4s, v20.4s .endm .macro SSD_START_16 ld1 {v16.8h, v17.8h}, [x0], x1 ld1 {v18.8h, v19.8h}, [x2], x3 sub v2.8h, v16.8h, v18.8h sub v3.8h, v17.8h, v19.8h ld1 {v16.8h, v17.8h}, [x0], x1 smull v0.4s, v2.4h, v2.4h smull2 v20.4s, v2.8h, v2.8h ld1 {v18.8h, v19.8h}, [x2], x3 smlal v0.4s, v3.4h, v3.4h smlal2 v20.4s, v3.8h, v3.8h .endm .macro SSD_16 sub v2.8h, v16.8h, v18.8h sub v3.8h, v17.8h, v19.8h ld1 {v16.8h, v17.8h}, [x0], x1 smlal v0.4s, v2.4h, v2.4h smlal2 v20.4s, v2.8h, v2.8h ld1 {v18.8h, v19.8h}, [x2], x3 smlal v0.4s, v3.4h, v3.4h smlal2 v20.4s, v3.8h, v3.8h .endm .macro SSD_END_16 sub v2.8h, v16.8h, v18.8h sub v3.8h, v17.8h, v19.8h smlal v0.4s, v2.4h, v2.4h smlal2 v20.4s, v2.8h, v2.8h smlal v0.4s, v3.4h, v3.4h smlal2 v20.4s, v3.8h, v3.8h add v0.4s, v0.4s, v20.4s .endm .macro SSD_FUNC w h function pixel_ssd_\w\()x\h\()_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 SSD_START_\w .rept \h-2 SSD_\w .endr SSD_END_\w addv s0, v0.4s fmov w0, s0 ret endfunc .endm function pixel_satd_4x4_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v1.d}[0], [x2], x3 ld1 {v0.d}[0], [x0], x1 ld1 {v3.d}[0], [x2], x3 ld1 {v2.d}[0], [x0], x1 ld1 {v1.d}[1], [x2], x3 ld1 {v0.d}[1], [x0], x1 ld1 {v3.d}[1], [x2], x3 ld1 {v2.d}[1], [x0], x1 sub v0.8h, v0.8h, v1.8h sub v1.8h, v2.8h, v3.8h SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h zip1 v0.2d, v2.2d, v3.2d zip2 v1.2d, v2.2d, v3.2d SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h trn1 v0.8h, v2.8h, v3.8h trn2 v1.8h, v2.8h, v3.8h SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h trn1 v0.4s, v2.4s, v3.4s trn2 v1.4s, v2.4s, v3.4s abs v0.8h, v0.8h abs v1.8h, v1.8h umax v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h fmov w0, s0 ret endfunc function pixel_satd_4x8_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v1.d}[0], [x2], x3 ld1 {v0.d}[0], [x0], x1 ld1 {v3.d}[0], [x2], x3 ld1 {v2.d}[0], [x0], x1 ld1 {v5.d}[0], [x2], x3 ld1 {v4.d}[0], [x0], x1 ld1 {v7.d}[0], [x2], x3 ld1 {v6.d}[0], [x0], x1 ld1 {v1.d}[1], [x2], x3 ld1 {v0.d}[1], [x0], x1 ld1 {v3.d}[1], [x2], x3 ld1 {v2.d}[1], [x0], x1 ld1 {v5.d}[1], [x2], x3 ld1 {v4.d}[1], [x0], x1 ld1 {v7.d}[1], [x2], x3 ld1 {v6.d}[1], [x0], x1 b satd_4x8_8x4_end_neon endfunc function 
pixel_satd_8x4_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v1.8h}, [x2], x3 ld1 {v0.8h}, [x0], x1 ld1 {v3.8h}, [x2], x3 ld1 {v2.8h}, [x0], x1 ld1 {v5.8h}, [x2], x3 ld1 {v4.8h}, [x0], x1 ld1 {v7.8h}, [x2], x3 ld1 {v6.8h}, [x0], x1 endfunc function satd_4x8_8x4_end_neon sub v0.8h, v0.8h, v1.8h sub v1.8h, v2.8h, v3.8h sub v2.8h, v4.8h, v5.8h sub v3.8h, v6.8h, v7.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h trn1 v0.8h, v4.8h, v5.8h trn2 v1.8h, v4.8h, v5.8h trn1 v2.8h, v6.8h, v7.8h trn2 v3.8h, v6.8h, v7.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h trn1 v0.4s, v16.4s, v18.4s trn2 v1.4s, v16.4s, v18.4s trn1 v2.4s, v17.4s, v19.4s trn2 v3.4s, v17.4s, v19.4s abs v0.8h, v0.8h abs v1.8h, v1.8h abs v2.8h, v2.8h abs v3.8h, v3.8h umax v0.8h, v0.8h, v1.8h umax v1.8h, v2.8h, v3.8h add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret endfunc function pixel_satd_4x16_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v1.d}[0], [x2], x3 ld1 {v0.d}[0], [x0], x1 ld1 {v3.d}[0], [x2], x3 ld1 {v2.d}[0], [x0], x1 ld1 {v5.d}[0], [x2], x3 ld1 {v4.d}[0], [x0], x1 ld1 {v7.d}[0], [x2], x3 ld1 {v6.d}[0], [x0], x1 ld1 {v1.d}[1], [x2], x3 ld1 {v0.d}[1], [x0], x1 ld1 {v3.d}[1], [x2], x3 ld1 {v2.d}[1], [x0], x1 ld1 {v5.d}[1], [x2], x3 ld1 {v4.d}[1], [x0], x1 ld1 {v7.d}[1], [x2], x3 ld1 {v6.d}[1], [x0], x1 sub v16.8h, v0.8h, v1.8h sub v17.8h, v2.8h, v3.8h sub v18.8h, v4.8h, v5.8h sub v19.8h, v6.8h, v7.8h ld1 {v1.d}[0], [x2], x3 ld1 {v0.d}[0], [x0], x1 ld1 {v3.d}[0], [x2], x3 ld1 {v2.d}[0], [x0], x1 ld1 {v5.d}[0], [x2], x3 ld1 {v4.d}[0], [x0], x1 ld1 {v7.d}[0], [x2], x3 ld1 {v6.d}[0], [x0], x1 ld1 {v1.d}[1], [x2], x3 ld1 {v0.d}[1], [x0], x1 ld1 {v3.d}[1], [x2], x3 ld1 {v2.d}[1], [x0], x1 ld1 {v5.d}[1], [x2], x3 ld1 {v4.d}[1], [x0], x1 ld1 {v7.d}[1], [x2], x3 ld1 {v6.d}[1], [x0], x1 sub v20.8h, v0.8h, v1.8h sub v21.8h, v2.8h, v3.8h sub v22.8h, v4.8h, v5.8h sub v23.8h, v6.8h, v7.8h SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h bl satd_8x4v_8x8h_neon add v30.8h, v0.8h, v1.8h add v31.8h, v2.8h, v3.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h fmov w0, s0 ret x4 endfunc .macro load_diff_fly_8x8 ld1 {v1.8h}, [x2], x3 ld1 {v0.8h}, [x0], x1 ld1 {v3.8h}, [x2], x3 ld1 {v2.8h}, [x0], x1 sub v16.8h, v0.8h, v1.8h ld1 {v5.8h}, [x2], x3 ld1 {v4.8h}, [x0], x1 sub v17.8h, v2.8h, v3.8h ld1 {v7.8h}, [x2], x3 ld1 {v6.8h}, [x0], x1 sub v18.8h, v4.8h, v5.8h ld1 {v1.8h}, [x2], x3 ld1 {v0.8h}, [x0], x1 sub v19.8h, v6.8h, v7.8h ld1 {v3.8h}, [x2], x3 ld1 {v2.8h}, [x0], x1 sub v20.8h, v0.8h, v1.8h ld1 {v5.8h}, [x2], x3 ld1 {v4.8h}, [x0], x1 sub v21.8h, v2.8h, v3.8h ld1 {v7.8h}, [x2], x3 ld1 {v6.8h}, [x0], x1 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h sub v22.8h, v4.8h, v5.8h sub v23.8h, v6.8h, v7.8h .endm function pixel_satd_8x8_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl satd_8x8_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function pixel_satd_8x16_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl satd_8x8_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v0.8h, v1.8h bl satd_8x8_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v31.8h, v0.8h, v1.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function satd_8x8_neon load_diff_fly_8x8 endfunc // one vertical 
hadamard pass and two horizontal function satd_8x4v_8x8h_neon SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h transpose v0.4s, v2.4s, v16.4s, v18.4s transpose v1.4s, v3.4s, v17.4s, v19.4s transpose v4.4s, v6.4s, v20.4s, v22.4s transpose v5.4s, v7.4s, v21.4s, v23.4s abs v0.8h, v0.8h abs v1.8h, v1.8h abs v2.8h, v2.8h abs v3.8h, v3.8h abs v4.8h, v4.8h abs v5.8h, v5.8h abs v6.8h, v6.8h abs v7.8h, v7.8h umax v0.8h, v0.8h, v2.8h umax v1.8h, v1.8h, v3.8h umax v2.8h, v4.8h, v6.8h umax v3.8h, v5.8h, v7.8h ret endfunc function pixel_ssd_nv12_core_neon, export=1 sxtw x8, w4 add x8, x8, #8 and x8, x8, #~15 movi v6.2d, #0 movi v7.2d, #0 sub x1, x1, x8, lsl #1 sub x3, x3, x8, lsl #1 lsl x1, x1, #1 lsl x3, x3, #1 lsl x4, x4, #1 1: subs w8, w4, #32 ld2 {v0.8h, v1.8h}, [x0], #32 ld2 {v2.8h, v3.8h}, [x2], #32 ld2 {v24.8h, v25.8h}, [x0], #32 ld2 {v26.8h, v27.8h}, [x2], #32 sub v16.8h, v0.8h, v2.8h sub v17.8h, v1.8h, v3.8h smull v20.4s, v16.4h, v16.4h smull v21.4s, v17.4h, v17.4h sub v18.8h, v24.8h, v26.8h sub v19.8h, v25.8h, v27.8h smlal2 v20.4s, v16.8h, v16.8h smlal2 v21.4s, v17.8h, v17.8h b.lt 4f b.eq 3f 2: smlal v20.4s, v18.4h, v18.4h smlal v21.4s, v19.4h, v19.4h ld2 {v0.8h, v1.8h}, [x0], #32 ld2 {v2.8h, v3.8h}, [x2], #32 smlal2 v20.4s, v18.8h, v18.8h smlal2 v21.4s, v19.8h, v19.8h subs w8, w8, #32 sub v16.8h, v0.8h, v2.8h sub v17.8h, v1.8h, v3.8h smlal v20.4s, v16.4h, v16.4h smlal v21.4s, v17.4h, v17.4h ld2 {v24.8h,v25.8h}, [x0], #32 ld2 {v26.8h,v27.8h}, [x2], #32 smlal2 v20.4s, v16.8h, v16.8h smlal2 v21.4s, v17.8h, v17.8h b.lt 4f sub v18.8h, v24.8h, v26.8h sub v19.8h, v25.8h, v27.8h b.gt 2b 3: smlal v20.4s, v18.4h, v18.4h smlal v21.4s, v19.4h, v19.4h smlal2 v20.4s, v18.8h, v18.8h smlal2 v21.4s, v19.8h, v19.8h 4: subs w5, w5, #1 uaddw v6.2d, v6.2d, v20.2s uaddw v7.2d, v7.2d, v21.2s add x0, x0, x1 add x2, x2, x3 uaddw2 v6.2d, v6.2d, v20.4s uaddw2 v7.2d, v7.2d, v21.4s b.gt 1b addp v6.2d, v6.2d, v7.2d st1 {v6.d}[0], [x6] st1 {v6.d}[1], [x7] ret endfunc .macro pixel_var_8 h function pixel_var_8x\h\()_neon, export=1 lsl x1, x1, #1 ld1 {v16.8h}, [x0], x1 ld1 {v17.8h}, [x0], x1 mov x2, \h - 4 umull v1.4s, v16.4h, v16.4h umull2 v30.4s, v16.8h, v16.8h mov v0.16b, v16.16b umull v2.4s, v17.4h, v17.4h umull2 v31.4s, v17.8h, v17.8h add v0.8h, v0.8h, v17.8h ld1 {v18.8h}, [x0], x1 ld1 {v19.8h}, [x0], x1 1: subs x2, x2, #4 add v0.8h, v0.8h, v18.8h umull v24.4s, v18.4h, v18.4h umull2 v25.4s, v18.8h, v18.8h ld1 {v20.8h}, [x0], x1 add v0.8h, v0.8h, v19.8h umull v26.4s, v19.4h, v19.4h umull2 v27.4s, v19.8h, v19.8h add v1.4s, v1.4s, v24.4s add v30.4s, v30.4s, v25.4s ld1 {v21.8h}, [x0], x1 add v0.8h, v0.8h, v20.8h umull v28.4s, v20.4h, v20.4h umull2 v29.4s, v20.8h, v20.8h add v2.4s, v2.4s, v26.4s add v31.4s, v31.4s, v27.4s ld1 {v18.8h}, [x0], x1 add v0.8h, v0.8h, v21.8h umull v3.4s, v21.4h, v21.4h umull2 v4.4s, v21.8h, v21.8h add v1.4s, v1.4s, v28.4s add v30.4s, v30.4s, v29.4s ld1 {v19.8h}, [x0], x1 add v2.4s, v2.4s, v3.4s add v31.4s, v31.4s, v4.4s b.gt 1b add v0.8h, v0.8h, v18.8h umull v24.4s, v18.4h, v18.4h umull2 v25.4s, v18.8h, v18.8h add v0.8h, v0.8h, v19.8h umull v26.4s, v19.4h, v19.4h umull2 v27.4s, v19.8h, v19.8h 
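// The loop above accumulates the plain pixel sum in v0 and the sums of
// squares in v1/v30 and v2/v31; the shared var_end tail below packs the two
// results into a single 64-bit return value (pixel sum in bits [31:0], sum
// of squares in bits [63:32]).  A rough sketch of how a caller can turn that
// into a variance (the exact code lives on the C side):
//     uint32_t sum = (uint32_t)ret, ssq = (uint32_t)(ret >> 32);
//     uint32_t var = ssq - (uint32_t)((uint64_t)sum * sum >> shift);  // shift = log2(w*h)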
add v1.4s, v1.4s, v24.4s add v30.4s, v30.4s, v25.4s add v2.4s, v2.4s, v26.4s add v31.4s, v31.4s, v27.4s b var_end endfunc .endm function pixel_var_16x16_neon, export=1 lsl x1, x1, #1 ld1 {v16.8h, v17.8h}, [x0], x1 ld1 {v18.8h, v19.8h}, [x0], x1 mov x2, #14 umull v1.4s, v16.4h, v16.4h umull2 v30.4s, v16.8h, v16.8h add v0.8h, v16.8h, v17.8h umull v2.4s, v17.4h, v17.4h umull2 v31.4s, v17.8h, v17.8h 1: subs x2, x2, #2 ld1 {v20.8h, v21.8h}, [x0], x1 add v0.8h, v0.8h, v18.8h umlal v1.4s, v18.4h, v18.4h umlal2 v30.4s, v18.8h, v18.8h umlal v2.4s, v19.4h, v19.4h umlal2 v31.4s, v19.8h, v19.8h add v0.8h, v0.8h, v19.8h ld1 {v18.8h, v19.8h}, [x0], x1 add v0.8h, v0.8h, v20.8h umlal v1.4s, v20.4h, v20.4h umlal2 v30.4s, v20.8h, v20.8h umlal v2.4s, v21.4h, v21.4h umlal2 v31.4s, v21.8h, v21.8h add v0.8h, v0.8h, v21.8h b.gt 1b add v0.8h, v0.8h, v18.8h umlal v1.4s, v18.4h, v18.4h umlal2 v30.4s, v18.8h, v18.8h umlal v2.4s, v19.4h, v19.4h umlal2 v31.4s, v19.8h, v19.8h add v0.8h, v0.8h, v19.8h endfunc function var_end add v1.4s, v1.4s, v2.4s add v30.4s, v30.4s, v31.4s add v1.4s, v1.4s, v30.4s uaddlv s0, v0.8h uaddlv d1, v1.4s mov w0, v0.s[0] mov x1, v1.d[0] orr x0, x0, x1, lsl #32 ret endfunc .macro pixel_var2_8 h function pixel_var2_8x\h\()_neon, export=1 mov x3, #32 ld1 {v16.8h}, [x0], #16 ld1 {v18.8h}, [x1], x3 ld1 {v17.8h}, [x0], #16 ld1 {v19.8h}, [x1], x3 mov x5, \h - 2 sub v0.8h, v16.8h, v18.8h sub v1.8h, v17.8h, v19.8h ld1 {v16.8h}, [x0], #16 ld1 {v18.8h}, [x1], x3 smull v2.4s, v0.4h, v0.4h smull2 v3.4s, v0.8h, v0.8h smull v4.4s, v1.4h, v1.4h smull2 v5.4s, v1.8h, v1.8h sub v6.8h, v16.8h, v18.8h 1: subs x5, x5, #1 ld1 {v17.8h}, [x0], #16 ld1 {v19.8h}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h sub v7.8h, v17.8h, v19.8h add v0.8h, v0.8h, v6.8h ld1 {v16.8h}, [x0], #16 ld1 {v18.8h}, [x1], x3 smlal v4.4s, v7.4h, v7.4h smlal2 v5.4s, v7.8h, v7.8h sub v6.8h, v16.8h, v18.8h add v1.8h, v1.8h, v7.8h b.gt 1b ld1 {v17.8h}, [x0], #16 ld1 {v19.8h}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h sub v7.8h, v17.8h, v19.8h add v0.8h, v0.8h, v6.8h smlal v4.4s, v7.4h, v7.4h add v1.8h, v1.8h, v7.8h smlal2 v5.4s, v7.8h, v7.8h saddlv s0, v0.8h saddlv s1, v1.8h add v2.4s, v2.4s, v3.4s add v4.4s, v4.4s, v5.4s mov w0, v0.s[0] mov w1, v1.s[0] addv s2, v2.4s addv s4, v4.4s mul w0, w0, w0 mul w1, w1, w1 mov w3, v2.s[0] mov w4, v4.s[0] sub w0, w3, w0, lsr # 6 + (\h >> 4) sub w1, w4, w1, lsr # 6 + (\h >> 4) str w3, [x2] add w0, w0, w1 str w4, [x2, #4] ret endfunc .endm function pixel_satd_16x8_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl satd_16x4_neon add v30.8h, v0.8h, v1.8h add v31.8h, v2.8h, v3.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h add v30.8h, v30.8h, v0.8h add v31.8h, v31.8h, v1.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc function pixel_satd_16x16_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl satd_16x4_neon uaddl v30.4s, v0.4h, v1.4h uaddl v31.4s, v2.4h, v3.4h uaddl2 v28.4s, v0.8h, v1.8h uaddl2 v29.4s, v2.8h, v3.8h add v30.4s, v30.4s, v28.4s add v31.4s, v31.4s, v29.4s bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h uaddw v30.4s, v30.4s, v0.4h uaddw2 v30.4s, v30.4s, v0.8h uaddw v31.4s, v31.4s, v1.4h uaddw2 v31.4s, v31.4s, v1.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h uaddw v30.4s, v30.4s, v0.4h uaddw2 v30.4s, v30.4s, v0.8h uaddw v31.4s, v31.4s, v1.4h uaddw2 v31.4s, v31.4s, v1.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h add v1.8h, v2.8h, v3.8h uaddw v30.4s, 
v30.4s, v0.4h uaddw2 v30.4s, v30.4s, v0.8h uaddw v31.4s, v31.4s, v1.4h uaddw2 v31.4s, v31.4s, v1.8h add v0.4s, v30.4s, v31.4s addv s0, v0.4s mov w0, v0.s[0] ret x4 endfunc function satd_16x4_neon ld1 {v0.8h, v1.8h}, [x2], x3 ld1 {v2.8h, v3.8h}, [x0], x1 sub v16.8h, v2.8h, v0.8h sub v20.8h, v3.8h, v1.8h ld1 {v4.8h, v5.8h}, [x2], x3 ld1 {v6.8h, v7.8h}, [x0], x1 sub v17.8h, v6.8h, v4.8h sub v21.8h, v7.8h, v5.8h ld1 {v0.8h, v1.8h}, [x2], x3 ld1 {v2.8h, v3.8h}, [x0], x1 sub v18.8h, v2.8h, v0.8h sub v22.8h, v3.8h, v1.8h ld1 {v4.8h, v5.8h}, [x2], x3 ld1 {v6.8h, v7.8h}, [x0], x1 sub v19.8h, v6.8h, v4.8h sub v23.8h, v7.8h, v5.8h SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h b satd_8x4v_8x8h_neon endfunc function pixel_sa8d_8x8_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl pixel_sa8d_8x8_neon add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] add w0, w0, #1 lsr w0, w0, #1 ret x4 endfunc function pixel_sa8d_16x16_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl pixel_sa8d_8x8_neon uaddlp v30.4s, v0.8h uaddlp v31.4s, v1.8h bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h sub x0, x0, x1, lsl #4 sub x2, x2, x3, lsl #4 add x0, x0, #16 add x2, x2, #16 bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h add v0.4s, v30.4s, v31.4s addv s0, v0.4s mov w0, v0.s[0] add w0, w0, #1 lsr w0, w0, #1 ret x4 endfunc .macro sa8d_satd_8x8 satd= function pixel_sa8d_\satd\()8x8_neon load_diff_fly_8x8 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h .ifc \satd, satd_ transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h transpose v4.4s, v6.4s, v24.4s, v26.4s transpose v5.4s, v7.4s, v25.4s, v27.4s transpose v24.4s, v26.4s, v0.4s, v2.4s transpose v25.4s, v27.4s, v1.4s, v3.4s abs v0.8h, v4.8h abs v1.8h, v5.8h abs v2.8h, v6.8h abs v3.8h, v7.8h abs v4.8h, v24.8h abs v5.8h, v25.8h abs v6.8h, v26.8h abs v7.8h, v27.8h umax v0.8h, v0.8h, v2.8h umax v1.8h, v1.8h, v3.8h umax v2.8h, v4.8h, v6.8h umax v3.8h, v5.8h, v7.8h add v26.8h, v0.8h, v1.8h add v27.8h, v2.8h, v3.8h .endif SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h transpose v20.8h, v21.8h, v16.8h, v17.8h transpose v4.8h, v5.8h, v0.8h, v1.8h transpose v22.8h, v23.8h, v18.8h, v19.8h transpose v6.8h, v7.8h, v2.8h, v3.8h SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h transpose v20.4s, v22.4s, v2.4s, v0.4s transpose v21.4s, v23.4s, v3.4s, v1.4s transpose v16.4s, v18.4s, v24.4s, v4.4s transpose v17.4s, v19.4s, v25.4s, v5.4s SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h transpose v16.2d, v20.2d, v0.2d, v4.2d transpose v17.2d, v21.2d, v1.2d, v5.2d transpose v18.2d, v22.2d, v2.2d, v6.2d transpose v19.2d, v23.2d, v3.2d, v7.2d abs v16.8h, v16.8h abs v20.8h, v20.8h abs v17.8h, v17.8h abs v21.8h, v21.8h abs v18.8h, v18.8h abs v22.8h, v22.8h abs v19.8h, v19.8h abs 
v23.8h, v23.8h umax v16.8h, v16.8h, v20.8h umax v17.8h, v17.8h, v21.8h umax v18.8h, v18.8h, v22.8h umax v19.8h, v19.8h, v23.8h add v0.8h, v16.8h, v17.8h add v1.8h, v18.8h, v19.8h ret endfunc .endm function pixel_sa8d_satd_16x16_neon, export=1 mov x4, x30 lsl x1, x1, #1 lsl x3, x3, #1 bl pixel_sa8d_satd_8x8_neon uaddlp v30.4s, v0.8h uaddlp v31.4s, v1.8h uaddlp v28.4s, v26.8h uaddlp v29.4s, v27.8h bl pixel_sa8d_satd_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h uadalp v28.4s, v26.8h uadalp v29.4s, v27.8h sub x0, x0, x1, lsl #4 sub x2, x2, x3, lsl #4 add x0, x0, #16 add x2, x2, #16 bl pixel_sa8d_satd_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h uadalp v28.4s, v26.8h uadalp v29.4s, v27.8h bl pixel_sa8d_satd_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h uadalp v28.4s, v26.8h uadalp v29.4s, v27.8h add v0.4s, v30.4s, v31.4s // sa8d add v1.4s, v28.4s, v29.4s // satd addv s0, v0.4s addv s1, v1.4s urshr v0.4s, v0.4s, #1 fmov w0, s0 fmov w1, s1 add x0, x0, x1, lsl #32 ret x4 endfunc .macro HADAMARD_AC w h function pixel_hadamard_ac_\w\()x\h\()_neon, export=1 movrel x5, mask_ac_4_8 mov x4, x30 lsl x1, x1, #1 ld1 {v30.8h,v31.8h}, [x5] movi v28.16b, #0 movi v29.16b, #0 bl hadamard_ac_8x8_neon .if \h > 8 bl hadamard_ac_8x8_neon .endif .if \w > 8 sub x0, x0, x1, lsl #3 add x0, x0, 16 bl hadamard_ac_8x8_neon .endif .if \w * \h == 256 sub x0, x0, x1, lsl #4 bl hadamard_ac_8x8_neon .endif addv s1, v29.4s addv s0, v28.4s mov w1, v1.s[0] mov w0, v0.s[0] lsr w1, w1, #2 lsr w0, w0, #1 orr x0, x0, x1, lsl #32 ret x4 endfunc .endm // v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 function hadamard_ac_8x8_neon ld1 {v16.8h}, [x0], x1 ld1 {v17.8h}, [x0], x1 ld1 {v18.8h}, [x0], x1 ld1 {v19.8h}, [x0], x1 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h ld1 {v20.8h}, [x0], x1 ld1 {v21.8h}, [x0], x1 SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h ld1 {v22.8h}, [x0], x1 ld1 {v23.8h}, [x0], x1 SUMSUB_AB v4.8h, v5.8h, v20.8h, v21.8h SUMSUB_AB v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h transpose v0.4s, v2.4s, v16.4s, v18.4s transpose v1.4s, v3.4s, v17.4s, v19.4s transpose v4.4s, v6.4s, v20.4s, v22.4s transpose v5.4s, v7.4s, v21.4s, v23.4s SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h abs v0.8h, v16.8h abs v4.8h, v20.8h abs v1.8h, v17.8h abs v5.8h, v21.8h abs v2.8h, v18.8h abs v6.8h, v22.8h abs v3.8h, v19.8h abs v7.8h, v23.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h and v0.16b, v0.16b, v30.16b add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h uadalp v28.4s, v0.8h uadalp v28.4s, v1.8h SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h transpose v16.2d, v17.2d, v6.2d, v7.2d transpose v18.2d, v19.2d, v4.2d, v5.2d transpose v20.2d, v21.2d, v2.2d, v3.2d abs v16.8h, v16.8h abs v17.8h, v17.8h abs v18.8h, v18.8h abs v19.8h, v19.8h abs v20.8h, v20.8h abs v21.8h, v21.8h transpose v7.2d, v6.2d, v1.2d, v0.2d umax v3.8h, v16.8h, v17.8h umax v2.8h, 
v18.8h, v19.8h umax v1.8h, v20.8h, v21.8h SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h add v2.8h, v2.8h, v3.8h add v2.8h, v2.8h, v1.8h and v4.16b, v4.16b, v31.16b add v2.8h, v2.8h, v2.8h abs v5.8h, v5.8h abs v4.8h, v4.8h add v2.8h, v2.8h, v5.8h add v2.8h, v2.8h, v4.8h uadalp v29.4s, v2.8h ret endfunc function pixel_ssim_4x4x2_core_neon, export=1 lsl x1, x1, #1 lsl x3, x3, #1 ld1 {v0.8h}, [x0], x1 ld1 {v2.8h}, [x2], x3 ld1 {v28.8h}, [x0], x1 ld1 {v29.8h}, [x2], x3 umull v16.4s, v0.4h, v0.4h umull2 v17.4s, v0.8h, v0.8h umull v18.4s, v0.4h, v2.4h umull2 v19.4s, v0.8h, v2.8h umlal v16.4s, v2.4h, v2.4h umlal2 v17.4s, v2.8h, v2.8h ld1 {v26.8h}, [x0], x1 ld1 {v27.8h}, [x2], x3 umlal v16.4s, v28.4h, v28.4h umlal2 v17.4s, v28.8h, v28.8h umlal v18.4s, v28.4h, v29.4h umlal2 v19.4s, v28.8h, v29.8h umlal v16.4s, v29.4h, v29.4h umlal2 v17.4s, v29.8h, v29.8h add v0.8h, v0.8h, v28.8h add v1.8h, v2.8h, v29.8h umlal v16.4s, v26.4h, v26.4h umlal2 v17.4s, v26.8h, v26.8h umlal v18.4s, v26.4h, v27.4h umlal2 v19.4s, v26.8h, v27.8h umlal v16.4s, v27.4h, v27.4h umlal2 v17.4s, v27.8h, v27.8h ld1 {v28.8h}, [x0], x1 ld1 {v29.8h}, [x2], x3 add v0.8h, v0.8h, v26.8h add v1.8h, v1.8h, v27.8h umlal v16.4s, v28.4h, v28.4h umlal2 v17.4s, v28.8h, v28.8h umlal v18.4s, v28.4h, v29.4h umlal2 v19.4s, v28.8h, v29.8h umlal v16.4s, v29.4h, v29.4h umlal2 v17.4s, v29.8h, v29.8h add v0.8h, v0.8h, v28.8h add v1.8h, v1.8h, v29.8h addp v16.4s, v16.4s, v17.4s addp v17.4s, v18.4s, v19.4s uaddlp v0.4s, v0.8h uaddlp v1.4s, v1.8h addp v0.4s, v0.4s, v0.4s addp v1.4s, v1.4s, v1.4s addp v2.4s, v16.4s, v16.4s addp v3.4s, v17.4s, v17.4s st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [x4] ret endfunc function pixel_ssim_end4_neon, export=1 mov x5, #4 ld1 {v16.4s, v17.4s}, [x0], #32 ld1 {v18.4s, v19.4s}, [x1], #32 subs x2, x5, w2, uxtw // These values must be stored in float, since with 10 bit depth edge cases // may overflow. The hexadecimal values are IEEE-754 representation of the // floating point numbers. 
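// For reference, evaluating those expressions gives:
//     ssim_c1 = .01*.01*1023*1023*64    = 6697.7856    -> 0x45d14e49 as an IEEE-754 float
//     ssim_c2 = .03*.03*1023*1023*64*63 = 3797644.4352 -> 0x4a67ca32 as an IEEE-754 float
// A quick way to re-derive such an encoding (illustrative C, not part of x264):
//     union { float f; uint32_t u; } c = { .f = (float)(.01*.01*1023*1023*64) };
//     printf( "0x%08x\n", c.u );   // should print 0x45d14e49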
ldr w3, =0x45d14e49 // ssim_c1 = .01*.01*1023*1023*64 ldr w4, =0x4a67ca32 // ssim_c2 = .03*.03*1023*1023*64*63 add v0.4s, v16.4s, v18.4s add v1.4s, v17.4s, v19.4s add v0.4s, v0.4s, v1.4s ld1 {v20.4s, v21.4s}, [x0], #32 ld1 {v22.4s, v23.4s}, [x1], #32 add v2.4s, v20.4s, v22.4s add v3.4s, v21.4s, v23.4s add v1.4s, v1.4s, v2.4s ld1 {v16.4s}, [x0], #16 ld1 {v18.4s}, [x1], #16 add v16.4s, v16.4s, v18.4s add v2.4s, v2.4s, v3.4s add v3.4s, v3.4s, v16.4s dup v30.4s, w3 dup v31.4s, w4 transpose v4.4s, v5.4s, v0.4s, v1.4s transpose v6.4s, v7.4s, v2.4s, v3.4s transpose v0.2d, v2.2d, v4.2d, v6.2d transpose v1.2d, v3.2d, v5.2d, v7.2d // Conversion to floating point number must occur earlier than in 8 bit case // because of the range overflow scvtf v0.4s, v0.4s scvtf v2.4s, v2.4s scvtf v1.4s, v1.4s scvtf v3.4s, v3.4s fmul v16.4s, v0.4s, v1.4s // s1*s2 fmul v0.4s, v0.4s, v0.4s fmla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2 // IEEE-754 hexadecimal representation of multipliers ldr w3, =0x42800000 // 64 ldr w4, =0x43000000 // 128 dup v28.4s, w3 dup v29.4s, w4 fmul v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v29.4s fadd v1.4s, v16.4s, v16.4s fsub v2.4s, v2.4s, v0.4s // vars fsub v3.4s, v3.4s, v1.4s // covar*2 fadd v0.4s, v0.4s, v30.4s fadd v2.4s, v2.4s, v31.4s fadd v1.4s, v1.4s, v30.4s fadd v3.4s, v3.4s, v31.4s fmul v0.4s, v0.4s, v2.4s fmul v1.4s, v1.4s, v3.4s fdiv v0.4s, v1.4s, v0.4s b.eq 1f movrel x3, mask add x3, x3, x2, lsl #2 ld1 {v29.4s}, [x3] and v0.16b, v0.16b, v29.16b 1: faddp v0.4s, v0.4s, v0.4s faddp s0, v0.2s ret endfunc #endif /* BIT_DEPTH == 8 */ SAD_FUNC 4, 4 SAD_FUNC 4, 8 SAD_FUNC 4, 16 SAD_FUNC 8, 4 SAD_FUNC 8, 8 SAD_FUNC 8, 16 SAD_FUNC 16, 8 SAD_FUNC 16, 16 SAD_X_FUNC 3, 4, 4 SAD_X_FUNC 3, 4, 8 SAD_X_FUNC 3, 8, 4 SAD_X_FUNC 3, 8, 8 SAD_X_FUNC 3, 8, 16 SAD_X_FUNC 3, 16, 8 SAD_X_FUNC 3, 16, 16 SAD_X_FUNC 4, 4, 4 SAD_X_FUNC 4, 4, 8 SAD_X_FUNC 4, 8, 4 SAD_X_FUNC 4, 8, 8 SAD_X_FUNC 4, 8, 16 SAD_X_FUNC 4, 16, 8 SAD_X_FUNC 4, 16, 16 SSD_FUNC 4, 4 SSD_FUNC 4, 8 SSD_FUNC 4, 16 SSD_FUNC 8, 4 SSD_FUNC 8, 8 SSD_FUNC 8, 16 SSD_FUNC 16, 8 SSD_FUNC 16, 16 pixel_var_8 8 pixel_var_8 16 pixel_var2_8 8 pixel_var2_8 16 sa8d_satd_8x8 sa8d_satd_8x8 satd_ HADAMARD_AC 8, 8 HADAMARD_AC 8, 16 HADAMARD_AC 16, 8 HADAMARD_AC 16, 16 #if BIT_DEPTH == 8 && HAVE_DOTPROD ENABLE_DOTPROD SAD_FUNC_DOTPROD 16, 8 SAD_FUNC_DOTPROD 16, 16 SAD_X_DOTPROD_FUNC 3, 16, 8 SAD_X_DOTPROD_FUNC 3, 16, 16 SAD_X_DOTPROD_FUNC 4, 16, 8 SAD_X_DOTPROD_FUNC 4, 16, 16 SSD_DOTPROD_FUNC 8, 4 SSD_DOTPROD_FUNC 8, 8 SSD_DOTPROD_FUNC 8, 16 SSD_DOTPROD_FUNC 16, 8 SSD_DOTPROD_FUNC 16, 16 DISABLE_DOTPROD #endif // BIT_DEPTH == 8 && HAVE_DOTPROD x264-master/common/aarch64/pixel.h000066400000000000000000000241351502133446700170760ustar00rootroot00000000000000/***************************************************************************** * pixel.h: aarch64 pixel metrics ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_AARCH64_PIXEL_H #define X264_AARCH64_PIXEL_H #define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon) #define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon) #define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon) #define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon) #define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon) #define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon) #define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon) #define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon) #define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon) #define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon) #define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon) #define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon) #define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon) #define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon) #define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon) #define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon) #define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon) #define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon) #define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon) #define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon) #define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon) #define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon) #define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon) #define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon) #define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon) #define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon) #define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon) #define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon) #define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon) #define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon) #define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon) #define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon) #define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon) #define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon) #define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon) #define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon) #define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon) #define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon) #if HAVE_DOTPROD #define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod) #define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod) #define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod) #define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod) #define x264_pixel_sad_x4_16x16_neon_dotprod 
x264_template(pixel_sad_x4_16x16_neon_dotprod) #define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod) #define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod) #define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod) #define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod) #define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod) #define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod) #endif // HAVE_DOTPROD #define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve) #define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve) #define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve) #define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve) #define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve) #define DECL_PIXELS( ret, name, suffix, args ) \ ret x264_pixel_##name##_16x16_##suffix args;\ ret x264_pixel_##name##_16x8_##suffix args;\ ret x264_pixel_##name##_8x16_##suffix args;\ ret x264_pixel_##name##_8x8_##suffix args;\ ret x264_pixel_##name##_8x4_##suffix args;\ ret x264_pixel_##name##_4x16_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ ret x264_pixel_##name##_4x4_##suffix args; #define DECL_PIXELS_SSD_SVE( ret, args ) \ ret x264_pixel_ssd_8x8_sve args;\ ret x264_pixel_ssd_8x4_sve args;\ ret x264_pixel_ssd_4x16_sve args;\ ret x264_pixel_ssd_4x8_sve args;\ ret x264_pixel_ssd_4x4_sve args; #define DECL_X1( name, suffix ) \ DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) ) #define DECL_X1_SSD_SVE( ) \ DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) ) #define DECL_X4( name, suffix ) \ DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\ DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) DECL_X1( sad, neon ) DECL_X4( sad, neon ) DECL_X1( satd, neon ) DECL_X1( ssd, neon ) DECL_X1_SSD_SVE( ) #if HAVE_DOTPROD DECL_X1( sad, neon_dotprod ) DECL_X4( sad, neon_dotprod ) DECL_X1( ssd, neon_dotprod ) #endif // HAVE_DOTPROD #define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon) void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * ); #define x264_pixel_vsad_neon x264_template(pixel_vsad_neon) int x264_pixel_vsad_neon( pixel *, intptr_t, int ); #if HAVE_DOTPROD #define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod) int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int ); #endif // HAVE_DOTPROD #define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon) int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon) int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon) uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve) int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon) uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t ); #define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon) uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t ); #define x264_pixel_var_16x16_neon 
x264_template(pixel_var_16x16_neon) uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t ); #define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon) int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * ); #define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon) int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * ); #define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve) uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t ); #define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve) uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon) uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon) uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon) uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon) uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve) uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve) uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve) uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve) uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t ); #define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon) void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t, const pixel *, intptr_t, int sums[2][4] ); #define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon) float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); #define x264_pixel_asd8_neon x264_template(pixel_asd8_neon) int x264_pixel_asd8_neon( pixel *, intptr_t, pixel *, intptr_t, int ); #endif x264-master/common/aarch64/predict-a.S000066400000000000000000000700521502133446700175770ustar00rootroot00000000000000/***************************************************************************** * predict.S: aarch64 intra prediction ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Mans Rullgard * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" const p8weight, align=4 .short 1, 2, 3, 4, 1, 2, 3, 4 endconst const p16weight, align=4 .short 1, 2, 3, 4, 5, 6, 7, 8 endconst .macro ldcol.8 vd, xn, xm, n=8, hi=0 .if \n == 8 || \hi == 0 ld1 {\vd\().b}[0], [\xn], \xm ld1 {\vd\().b}[1], [\xn], \xm ld1 {\vd\().b}[2], [\xn], \xm ld1 {\vd\().b}[3], [\xn], \xm .endif .if \n == 8 || \hi == 1 ld1 {\vd\().b}[4], [\xn], \xm ld1 {\vd\().b}[5], [\xn], \xm ld1 {\vd\().b}[6], [\xn], \xm ld1 {\vd\().b}[7], [\xn], \xm .endif .endm .macro ldcol.16 vd, xn, xm ldcol.8 \vd, \xn, \xm ld1 {\vd\().b}[ 8], [\xn], \xm ld1 {\vd\().b}[ 9], [\xn], \xm ld1 {\vd\().b}[10], [\xn], \xm ld1 {\vd\().b}[11], [\xn], \xm ld1 {\vd\().b}[12], [\xn], \xm ld1 {\vd\().b}[13], [\xn], \xm ld1 {\vd\().b}[14], [\xn], \xm ld1 {\vd\().b}[15], [\xn], \xm .endm function predict_4x4_h_aarch64, export=1 ldurb w1, [x0, #0*FDEC_STRIDE-1] mov w5, #0x01010101 ldrb w2, [x0, #1*FDEC_STRIDE-1] ldrb w3, [x0, #2*FDEC_STRIDE-1] mul w1, w1, w5 ldrb w4, [x0, #3*FDEC_STRIDE-1] mul w2, w2, w5 str w1, [x0, #0*FDEC_STRIDE] mul w3, w3, w5 str w2, [x0, #1*FDEC_STRIDE] mul w4, w4, w5 str w3, [x0, #2*FDEC_STRIDE] str w4, [x0, #3*FDEC_STRIDE] ret endfunc function predict_4x4_v_aarch64, export=1 ldur w1, [x0, #0 - 1 * FDEC_STRIDE] str w1, [x0, #0 + 0 * FDEC_STRIDE] str w1, [x0, #0 + 1 * FDEC_STRIDE] str w1, [x0, #0 + 2 * FDEC_STRIDE] str w1, [x0, #0 + 3 * FDEC_STRIDE] ret endfunc function predict_4x4_dc_neon, export=1 sub x1, x0, #FDEC_STRIDE ldurb w4, [x0, #-1 + 0 * FDEC_STRIDE] ldrb w5, [x0, #-1 + 1 * FDEC_STRIDE] ldrb w6, [x0, #-1 + 2 * FDEC_STRIDE] ldrb w7, [x0, #-1 + 3 * FDEC_STRIDE] add w4, w4, w5 ldr s0, [x1] add w6, w6, w7 uaddlv h0, v0.8b add w4, w4, w6 dup v0.4h, v0.h[0] dup v1.4h, w4 add v0.4h, v0.4h, v1.4h rshrn v0.8b, v0.8h, #3 str s0, [x0] str s0, [x0, #1 * FDEC_STRIDE] str s0, [x0, #2 * FDEC_STRIDE] str s0, [x0, #3 * FDEC_STRIDE] ret endfunc function predict_4x4_dc_top_neon, export=1 sub x1, x0, #FDEC_STRIDE ldr s0, [x1] uaddlv h0, v0.8b dup v0.4h, v0.h[0] rshrn v0.8b, v0.8h, #2 str s0, [x0] str s0, [x0, #1 * FDEC_STRIDE] str s0, [x0, #2 * FDEC_STRIDE] str s0, [x0, #3 * FDEC_STRIDE] ret ret endfunc function predict_4x4_ddr_neon, export=1 sub x1, x0, #FDEC_STRIDE+1 mov x7, #FDEC_STRIDE ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1 ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1 ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1 ext v0.8b, v1.8b, v0.8b, #7 ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1 ext v0.8b, v2.8b, v0.8b, #7 // a ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1 ext v1.8b, v3.8b, v0.8b, #7 // b ext v2.8b, v4.8b, v1.8b, #7 // c uaddl v0.8h, v0.8b, v1.8b uaddl v1.8h, v1.8b, v2.8b add v0.8h, v0.8h, v1.8h rshrn v0.8b, v0.8h, #2 ext v3.8b, v0.8b, v0.8b, #3 ext v2.8b, v0.8b, v0.8b, #2 ext v1.8b, v0.8b, v0.8b, #1 str s3, [x0], #FDEC_STRIDE str s2, [x0], #FDEC_STRIDE str s1, [x0], #FDEC_STRIDE str s0, [x0] ret endfunc function predict_4x4_ddl_neon, export=1 sub x0, x0, #FDEC_STRIDE mov x7, #FDEC_STRIDE ld1 {v0.8b}, [x0], x7 dup v3.8b, v0.b[7] ext v1.8b, v0.8b, v0.8b, #1 ext v2.8b, v0.8b, v3.8b, #2 uhadd v0.8b, v0.8b, v2.8b urhadd v0.8b, v0.8b, v1.8b str s0, [x0], #FDEC_STRIDE ext v1.8b, v0.8b, v0.8b, #1 ext v2.8b, v0.8b, v0.8b, #2 str s1, [x0], #FDEC_STRIDE ext v3.8b, v0.8b, v0.8b, #3 str s2, [x0], #FDEC_STRIDE str s3, [x0] ret endfunc function predict_8x8_dc_neon, export=1 mov x7, #FDEC_STRIDE ld1 {v0.16b}, [x1], #16 ld1 {v1.8b}, [x1] ext v0.16b, v0.16b, v0.16b, #7 uaddlv h1, v1.8b uaddlv h0, v0.8b add v0.8h, v0.8h, 
v1.8h dup v0.8h, v0.h[0] rshrn v0.8b, v0.8h, #4 .rept 8 st1 {v0.8b}, [x0], x7 .endr ret endfunc function predict_8x8_h_neon, export=1 mov x7, #FDEC_STRIDE ld1 {v16.16b}, [x1] dup v0.8b, v16.b[14] dup v1.8b, v16.b[13] st1 {v0.8b}, [x0], x7 dup v2.8b, v16.b[12] st1 {v1.8b}, [x0], x7 dup v3.8b, v16.b[11] st1 {v2.8b}, [x0], x7 dup v4.8b, v16.b[10] st1 {v3.8b}, [x0], x7 dup v5.8b, v16.b[9] st1 {v4.8b}, [x0], x7 dup v6.8b, v16.b[8] st1 {v5.8b}, [x0], x7 dup v7.8b, v16.b[7] st1 {v6.8b}, [x0], x7 st1 {v7.8b}, [x0], x7 ret endfunc function predict_8x8_v_neon, export=1 add x1, x1, #16 mov x7, #FDEC_STRIDE ld1 {v0.8b}, [x1] .rept 8 st1 {v0.8b}, [x0], x7 .endr ret endfunc function predict_8x8_ddl_neon, export=1 add x1, x1, #16 mov x7, #FDEC_STRIDE ld1 {v0.16b}, [x1] movi v3.16b, #0 dup v2.16b, v0.b[15] ext v4.16b, v3.16b, v0.16b, #15 ext v2.16b, v0.16b, v2.16b, #1 uhadd v4.16b, v4.16b, v2.16b urhadd v0.16b, v0.16b, v4.16b ext v1.16b, v0.16b, v0.16b, #1 ext v2.16b, v0.16b, v0.16b, #2 st1 {v1.8b}, [x0], x7 ext v3.16b, v0.16b, v0.16b, #3 st1 {v2.8b}, [x0], x7 ext v4.16b, v0.16b, v0.16b, #4 st1 {v3.8b}, [x0], x7 ext v5.16b, v0.16b, v0.16b, #5 st1 {v4.8b}, [x0], x7 ext v6.16b, v0.16b, v0.16b, #6 st1 {v5.8b}, [x0], x7 ext v7.16b, v0.16b, v0.16b, #7 st1 {v6.8b}, [x0], x7 ext v0.16b, v0.16b, v0.16b, #8 st1 {v7.8b}, [x0], x7 st1 {v0.8b}, [x0], x7 ret endfunc function predict_8x8_ddr_neon, export=1 ld1 {v0.16b,v1.16b}, [x1] ext v2.16b, v0.16b, v1.16b, #7 ext v4.16b, v0.16b, v1.16b, #9 ext v3.16b, v0.16b, v1.16b, #8 uhadd v2.16b, v2.16b, v4.16b urhadd v7.16b, v3.16b, v2.16b add x0, x0, #7*FDEC_STRIDE mov x7, #-1*FDEC_STRIDE ext v6.16b, v7.16b, v7.16b, #1 st1 {v7.8b}, [x0], x7 ext v5.16b, v7.16b, v7.16b, #2 st1 {v6.8b}, [x0], x7 ext v4.16b, v7.16b, v7.16b, #3 st1 {v5.8b}, [x0], x7 ext v3.16b, v7.16b, v7.16b, #4 st1 {v4.8b}, [x0], x7 ext v2.16b, v7.16b, v7.16b, #5 st1 {v3.8b}, [x0], x7 ext v1.16b, v7.16b, v7.16b, #6 st1 {v2.8b}, [x0], x7 ext v0.16b, v7.16b, v7.16b, #7 st1 {v1.8b}, [x0], x7 st1 {v0.8b}, [x0], x7 ret endfunc function predict_8x8_vl_neon, export=1 add x1, x1, #16 mov x7, #FDEC_STRIDE ld1 {v0.16b}, [x1] ext v1.16b, v1.16b, v0.16b, #15 ext v2.16b, v0.16b, v2.16b, #1 uhadd v1.16b, v1.16b, v2.16b urhadd v3.16b, v0.16b, v2.16b urhadd v0.16b, v0.16b, v1.16b ext v4.16b, v0.16b, v0.16b, #1 st1 {v3.8b}, [x0], x7 ext v5.16b, v3.16b, v3.16b, #1 st1 {v4.8b}, [x0], x7 ext v6.16b, v0.16b, v0.16b, #2 st1 {v5.8b}, [x0], x7 ext v7.16b, v3.16b, v3.16b, #2 st1 {v6.8b}, [x0], x7 ext v4.16b, v0.16b, v0.16b, #3 st1 {v7.8b}, [x0], x7 ext v5.16b, v3.16b, v3.16b, #3 st1 {v4.8b}, [x0], x7 ext v6.16b, v0.16b, v0.16b, #4 st1 {v5.8b}, [x0], x7 st1 {v6.8b}, [x0], x7 ret endfunc function predict_8x8_vr_neon, export=1 add x1, x1, #8 mov x7, #FDEC_STRIDE ld1 {v2.16b}, [x1] ext v1.16b, v2.16b, v2.16b, #14 ext v0.16b, v2.16b, v2.16b, #15 uhadd v3.16b, v2.16b, v1.16b urhadd v2.16b, v2.16b, v0.16b urhadd v0.16b, v0.16b, v3.16b ext v1.16b, v2.16b, v2.16b, #8 uzp1 v2.8b, v0.8b, v0.8b uzp2 v3.8b, v0.8b, v0.8b ext v0.16b, v0.16b, v0.16b, #8 st1 {v1.8b}, [x0], x7 st1 {v0.8b}, [x0], x7 ext v4.8b, v3.8b, v1.8b, #7 ext v5.8b, v2.8b, v0.8b, #7 st1 {v4.8b}, [x0], x7 st1 {v5.8b}, [x0], x7 ext v6.8b, v3.8b, v1.8b, #6 ext v7.8b, v2.8b, v0.8b, #6 st1 {v6.8b}, [x0], x7 st1 {v7.8b}, [x0], x7 ext v1.8b, v3.8b, v1.8b, #5 ext v0.8b, v2.8b, v0.8b, #5 st1 {v1.8b}, [x0], x7 st1 {v0.8b}, [x0], x7 ret endfunc function predict_8x8_hd_neon, export=1 add x1, x1, #7 mov x7, #FDEC_STRIDE ld1 {v1.16b}, [x1] ext v3.16b, v1.16b, v1.16b, #1 ext v2.16b, v1.16b, v1.16b, 
#2 urhadd v4.16b, v1.16b, v3.16b uhadd v1.16b, v1.16b, v2.16b urhadd v0.16b, v1.16b, v3.16b zip1 v16.8b, v4.8b, v0.8b zip2 v17.8b, v4.8b, v0.8b ext v7.16b, v0.16b, v0.16b, #8 ext v0.8b, v17.8b, v7.8b, #6 ext v1.8b, v17.8b, v7.8b, #4 st1 {v0.8b}, [x0], x7 ext v2.8b, v17.8b, v7.8b, #2 st1 {v1.8b}, [x0], x7 st1 {v2.8b}, [x0], x7 ext v3.8b, v16.8b, v17.8b, #6 st1 {v17.8b}, [x0], x7 ext v4.8b, v16.8b, v17.8b, #4 st1 {v3.8b}, [x0], x7 ext v5.8b, v16.8b, v17.8b, #2 st1 {v4.8b}, [x0], x7 st1 {v5.8b}, [x0], x7 st1 {v16.8b}, [x0], x7 ret endfunc function predict_8x8_hu_neon, export=1 add x1, x1, #7 mov x7, #FDEC_STRIDE ld1 {v7.8b}, [x1] dup v6.8b, v7.b[0] rev64 v7.8b, v7.8b ext v4.8b, v7.8b, v6.8b, #2 ext v2.8b, v7.8b, v6.8b, #1 uhadd v5.8b, v7.8b, v4.8b urhadd v0.8b, v2.8b, v7.8b urhadd v1.8b, v5.8b, v2.8b zip1 v16.8b, v0.8b, v1.8b zip2 v17.8b, v0.8b, v1.8b dup v18.4h, v17.h[3] ext v0.8b, v16.8b, v17.8b, #2 ext v1.8b, v16.8b, v17.8b, #4 ext v2.8b, v16.8b, v17.8b, #6 st1 {v16.8b}, [x0], x7 st1 {v0.8b}, [x0], x7 st1 {v1.8b}, [x0], x7 st1 {v2.8b}, [x0], x7 ext v4.8b, v17.8b, v18.8b, #2 ext v5.8b, v17.8b, v18.8b, #4 ext v6.8b, v17.8b, v18.8b, #6 st1 {v17.8b}, [x0], x7 st1 {v4.8b}, [x0], x7 st1 {v5.8b}, [x0], x7 st1 {v6.8b}, [x0] ret endfunc function predict_8x8c_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE ld1 {v0.8b}, [x2] uaddlp v0.4h, v0.8b addp v0.4h, v0.4h, v0.4h rshrn v0.8b, v0.8h, #2 dup v3.8b, v0.b[1] dup v2.8b, v0.b[0] transpose v0.2s, v1.2s, v2.2s, v3.2s b pred8x8c_dc_end endfunc function predict_8x8c_dc_left_neon, export=1 ldurb w2, [x0, #0 * FDEC_STRIDE - 1] ldrb w3, [x0, #1 * FDEC_STRIDE - 1] ldrb w4, [x0, #2 * FDEC_STRIDE - 1] ldrb w5, [x0, #3 * FDEC_STRIDE - 1] mov x1, #FDEC_STRIDE add w2, w2, w3 add w3, w4, w5 ldrb w6, [x0, #4 * FDEC_STRIDE - 1] ldrb w7, [x0, #5 * FDEC_STRIDE - 1] ldrb w8, [x0, #6 * FDEC_STRIDE - 1] ldrb w9, [x0, #7 * FDEC_STRIDE - 1] add w6, w6, w7 add w7, w8, w9 add w2, w2, w3 add w6, w6, w7 dup v0.8h, w2 dup v1.8h, w6 rshrn v0.8b, v0.8h, #2 rshrn v1.8b, v1.8h, #2 b pred8x8c_dc_end endfunc function predict_8x8c_dc_neon, export=1 mov x1, #FDEC_STRIDE sub x2, x0, #FDEC_STRIDE ldurb w10, [x0, #0 * FDEC_STRIDE - 1] ldrb w11, [x0, #1 * FDEC_STRIDE - 1] ldrb w12, [x0, #2 * FDEC_STRIDE - 1] ldrb w13, [x0, #3 * FDEC_STRIDE - 1] add w10, w10, w11 ldrb w4, [x0, #4 * FDEC_STRIDE - 1] ldrb w5, [x0, #5 * FDEC_STRIDE - 1] add w12, w12, w13 ldrb w6, [x0, #6 * FDEC_STRIDE - 1] ldrb w7, [x0, #7 * FDEC_STRIDE - 1] add w4, w4, w5 add w6, w6, w7 add w10, w10, w12, lsl #16 add w4, w4, w6, lsl #16 ld1 {v0.8b}, [x2] add x10, x10, x4, lsl #32 uaddlp v0.4h, v0.8b // s0, s1 mov v1.d[0], x10 // s2, s3 add v3.4h, v0.4h, v1.4h addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3 addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3 uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3 uzp1 v1.2d, v1.2d, v1.2d uzp1 v0.2d, v0.2d, v0.2d rshrn v3.8b, v1.8h, #3 rshrn v2.8b, v0.8h, #2 uzp1 v0.8b, v3.8b, v2.8b uzp2 v1.8b, v2.8b, v3.8b pred8x8c_dc_end: add x2, x0, #2 * FDEC_STRIDE add x4, x0, #4 * FDEC_STRIDE add x5, x0, #6 * FDEC_STRIDE st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x2], x1 st1 {v0.8b}, [x0] st1 {v0.8b}, [x2] st1 {v1.8b}, [x4], x1 st1 {v1.8b}, [x5], x1 st1 {v1.8b}, [x4] st1 {v1.8b}, [x5] ret endfunc function predict_8x8c_h_neon, export=1 sub x1, x0, #1 mov x7, #FDEC_STRIDE .rept 4 ld1r {v0.8b}, [x1], x7 ld1r {v1.8b}, [x1], x7 st1 {v0.8b}, [x0], x7 st1 {v1.8b}, [x0], x7 .endr ret endfunc function predict_8x8c_v_aarch64, export=1 ldur x1, [x0, #-FDEC_STRIDE] .irp c, 0,1,2,3,4,5,6,7 str x1, [x0, 
#\c * FDEC_STRIDE] .endr ret endfunc function predict_8x8c_p_neon, export=1 sub x3, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE add x2, x3, #4 sub x3, x3, #1 ld1 {v0.s}[0], [x3] ld1 {v2.s}[0], [x2], x1 ldcol.8 v0, x3, x1, 4, hi=1 add x3, x3, x1 ldcol.8 v3, x3, x1, 4 movrel x4, p8weight movrel x5, p16weight uaddl v4.8h, v2.8b, v3.8b rev32 v0.8b, v0.8b trn1 v2.2s, v2.2s, v3.2s ld1 {v7.8h}, [x4] usubl v2.8h, v2.8b, v0.8b mul v2.8h, v2.8h, v7.8h ld1 {v0.8h}, [x5] saddlp v2.4s, v2.8h addp v2.4s, v2.4s, v2.4s shl v3.2s, v2.2s, #4 add v2.2s, v2.2s, v3.2s rshrn v5.4h, v2.4s, #5 // b, c, x, x addp v2.4h, v5.4h, v5.4h shl v3.4h, v2.4h, #2 sub v3.4h, v3.4h, v2.4h // 3 * (b + c) rev64 v4.4h, v4.4h add v4.4h, v4.4h, v0.4h shl v2.4h, v4.4h, #4 // a sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16 ext v0.16b, v0.16b, v0.16b, #14 sub v6.4h, v5.4h, v3.4h mov v0.h[0], wzr mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b dup v1.8h, v2.h[0] // pix dup v2.8h, v5.h[1] // c add v1.8h, v1.8h, v0.8h // pix + x*b mov x3, #8 1: subs x3, x3, #1 sqshrun v0.8b, v1.8h, #5 add v1.8h, v1.8h, v2.8h st1 {v0.8b}, [x0], x1 b.ne 1b ret endfunc .macro loadsum4 wd, t1, t2, t3, x, idx .if \idx == 0 ldurb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1] .else ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1] .endif ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1] ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1] ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1] add \wd, \wd, \t1 add \t1, \t2, \t3 add \wd, \wd, \t1 .endm function predict_8x16c_h_neon, export=1 sub x2, x0, #1 add x3, x0, #FDEC_STRIDE - 1 mov x7, #2 * FDEC_STRIDE add x1, x0, #FDEC_STRIDE .rept 4 ld1r {v0.8b}, [x2], x7 ld1r {v1.8b}, [x3], x7 ld1r {v2.8b}, [x2], x7 ld1r {v3.8b}, [x3], x7 st1 {v0.8b}, [x0], x7 st1 {v1.8b}, [x1], x7 st1 {v2.8b}, [x0], x7 st1 {v3.8b}, [x1], x7 .endr ret endfunc function predict_8x16c_v_neon, export=1 sub x1, x0, #FDEC_STRIDE mov x2, #2 * FDEC_STRIDE ld1 {v0.8b}, [x1], x2 .rept 8 st1 {v0.8b}, [x0], x2 st1 {v0.8b}, [x1], x2 .endr ret endfunc function predict_8x16c_p_neon, export=1 movrel x4, p16weight ld1 {v17.8h}, [x4] sub x3, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE add x2, x3, #4 sub x3, x3, #1 ld1 {v0.8b}, [x3] ld1 {v2.8b}, [x2], x1 ldcol.8 v1, x3, x1 add x3, x3, x1 ldcol.8 v3, x3, x1 ext v4.8b, v2.8b, v2.8b, #3 ext v5.8b, v3.8b, v3.8b, #7 rev32 v0.8b, v0.8b rev64 v1.8b, v1.8b uaddl v4.8h, v5.8b, v4.8b // a * 1/16 usubl v2.8h, v2.8b, v0.8b mul v2.8h, v2.8h, v17.8h saddlp v2.4s, v2.8h addp v2.4s, v2.4s, v2.4s // H usubl v3.8h, v3.8b, v1.8b mul v3.8h, v3.8h, v17.8h saddlp v3.4s, v3.8h addp v3.4s, v3.4s, v3.4s addp v3.4s, v3.4s, v3.4s // V ext v17.16b, v17.16b, v17.16b, #14 shl v4.4h, v4.4h, #4 // a shl v6.2s, v2.2s, #4 // 16 * H shl v7.2s, v3.2s, #2 // 4 * V add v2.2s, v2.2s, v6.2s // 17 * H add v3.2s, v3.2s, v7.2s // 5 * V rshrn v2.4h, v2.4s, #5 // b rshrn v3.4h, v3.4s, #6 // c mov v17.h[0], wzr sub v4.4h, v4.4h, v2.4h // a - b shl v6.4h, v2.4h, #1 // 2 * b add v4.4h, v4.4h, v3.4h // a - b + c shl v7.4h, v3.4h, #3 // 8 * c sub v4.4h, v4.4h, v6.4h // a - 3b + c sub v4.4h, v4.4h, v7.4h // a - 3b - 7c mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b dup v1.8h, v4.h[0] // i00 dup v2.8h, v3.h[0] // c add v1.8h, v1.8h, v0.8h // pix + {0..7}*b mov x3, #16 1: subs x3, x3, #2 sqrshrun v4.8b, v1.8h, #5 add v1.8h, v1.8h, v2.8h sqrshrun v5.8b, v1.8h, #5 st1 {v4.8b}, [x0], x1 add v1.8h, v1.8h, v2.8h st1 {v5.8b}, [x0], x1 b.ne 1b ret endfunc function predict_8x16c_dc_neon, export=1 mov x1, #FDEC_STRIDE sub x10, x0, #FDEC_STRIDE loadsum4 w2, w3, w4, w5, x0, 0 ld1 {v6.8b}, [x10] 
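// loadsum4 (defined above) sums the four left-neighbour pixels of rows
// idx..idx+3 into a single GPR.  Below, the top row (v6) is reduced to the
// two 4-pixel sums s0/s1 and the left column to the four 4-row sums s2..s5;
// from these, four 8-pixel DC rows (v0..v3) are built and each one is stored
// to four consecutive lines by the .rept 4 block at the end of the function.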
loadsum4 w6, w7, w8, w9, x0, 4 uaddlp v6.4h, v6.8b dup v22.8h, w2 // s2 dup v23.8h, w6 // s3 loadsum4 w2, w3, w4, w5, x0, 8 addp v6.4h, v6.4h, v6.4h // s0, s1 loadsum4 w6, w7, w8, w9, x0, 12 dup v20.8h, v6.h[0] // s0 dup v21.8h, v6.h[1] // s1 dup v24.8h, w2 // s4 dup v25.8h, w6 // s5 ext v16.16b, v20.16b, v21.16b, #8 ext v17.16b, v22.16b, v21.16b, #8 ext v1.16b, v23.16b, v21.16b, #8 ext v2.16b, v24.16b, v21.16b, #8 ext v3.16b, v25.16b, v21.16b, #8 add v0.8h, v16.8h, v17.8h add v1.8h, v1.8h, v23.8h add v2.8h, v2.8h, v24.8h add v3.8h, v3.8h, v25.8h rshrn v0.8b, v0.8h, #3 rshrn v1.8b, v1.8h, #3 rshrn v2.8b, v2.8h, #3 rshrn v3.8b, v3.8h, #3 add x11, x0, #4 * FDEC_STRIDE add x12, x0, #8 * FDEC_STRIDE add x13, x0, #12 * FDEC_STRIDE .rept 4 st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x11], x1 st1 {v2.8b}, [x12], x1 st1 {v3.8b}, [x13], x1 .endr ret endfunc function predict_8x16c_dc_left_neon, export=1 mov x1, #FDEC_STRIDE ldurb w2, [x0, # 0 * FDEC_STRIDE - 1] ldrb w3, [x0, # 1 * FDEC_STRIDE - 1] ldrb w4, [x0, # 2 * FDEC_STRIDE - 1] ldrb w5, [x0, # 3 * FDEC_STRIDE - 1] add w2, w2, w3 ldrb w6, [x0, # 4 * FDEC_STRIDE - 1] add w4, w4, w5 ldrb w7, [x0, # 5 * FDEC_STRIDE - 1] add w2, w2, w4 ldrb w8, [x0, # 6 * FDEC_STRIDE - 1] ldrb w9, [x0, # 7 * FDEC_STRIDE - 1] dup v0.8h, w2 add w6, w6, w7 rshrn v0.8b, v0.8h, #2 add w8, w8, w9 ldrb w10, [x0, # 8 * FDEC_STRIDE - 1] ldrb w11, [x0, # 9 * FDEC_STRIDE - 1] add w6, w6, w8 ldrb w12, [x0, #10 * FDEC_STRIDE - 1] ldrb w13, [x0, #11 * FDEC_STRIDE - 1] dup v1.8h, w6 add w10, w10, w11 rshrn v1.8b, v1.8h, #2 add w12, w12, w13 ldrb w2, [x0, #12 * FDEC_STRIDE - 1] ldrb w3, [x0, #13 * FDEC_STRIDE - 1] add w10, w10, w12 ldrb w4, [x0, #14 * FDEC_STRIDE - 1] ldrb w5, [x0, #15 * FDEC_STRIDE - 1] dup v2.8h, w10 add w2, w2, w3 rshrn v2.8b, v2.8h, #2 add w4, w4, w5 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x0], x1 add w2, w2, w4 st1 {v0.8b}, [x0], x1 dup v3.8h, w2 st1 {v0.8b}, [x0], x1 rshrn v3.8b, v3.8h, #2 .irp idx, 1, 2, 3 .rept 4 st1 {v\idx\().8b}, [x0], x1 .endr .endr ret endfunc function predict_8x16c_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE ld1 {v0.8b}, [x2] uaddlp v0.4h, v0.8b addp v0.4h, v0.4h, v0.4h rshrn v4.8b, v0.8h, #2 dup v0.8b, v4.b[0] dup v1.8b, v4.b[1] ext v0.8b, v0.8b, v1.8b, #4 .rept 16 st1 {v0.8b}, [x0], x1 .endr ret endfunc function predict_16x16_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] b pred16x16_dc_end endfunc function predict_16x16_dc_left_neon, export=1 sub x2, x0, #1 mov x1, #FDEC_STRIDE ldcol.16 v0, x2, x1 uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] b pred16x16_dc_end endfunc function predict_16x16_dc_neon, export=1 sub x3, x0, #FDEC_STRIDE sub x2, x0, #1 mov x1, #FDEC_STRIDE ld1 {v0.16b}, [x3] ldcol.16 v1, x2, x1 uaddlv h0, v0.16b uaddlv h1, v1.16b add v0.4h, v0.4h, v1.4h rshrn v0.8b, v0.8h, #5 dup v0.16b, v0.b[0] pred16x16_dc_end: .rept 16 st1 {v0.16b}, [x0], x1 .endr ret endfunc function predict_16x16_h_neon, export=1 sub x1, x0, #1 mov x7, #FDEC_STRIDE .rept 8 ld1r {v0.16b}, [x1], x7 ld1r {v1.16b}, [x1], x7 st1 {v0.16b}, [x0], x7 st1 {v1.16b}, [x0], x7 .endr ret endfunc function predict_16x16_v_neon, export=1 sub x0, x0, #FDEC_STRIDE mov x7, #FDEC_STRIDE ld1 {v0.16b}, [x0], x7 .rept 16 st1 {v0.16b}, [x0], x7 .endr ret endfunc function predict_16x16_p_neon, export=1 sub x3, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE add x2, x3, #8 sub x3, x3, #1 ld1 {v0.8b}, [x3] ld1 {v2.8b}, [x2], x1 ldcol.8 v1, x3, x1 add x3, x3, 
x1 ldcol.8 v3, x3, x1 rev64 v0.8b, v0.8b rev64 v1.8b, v1.8b movrel x4, p16weight uaddl v4.8h, v2.8b, v3.8b ld1 {v7.8h}, [x4] usubl v2.8h, v2.8b, v0.8b usubl v3.8h, v3.8b, v1.8b mul v2.8h, v2.8h, v7.8h mul v3.8h, v3.8h, v7.8h saddlp v2.4s, v2.8h saddlp v3.4s, v3.8h addp v2.4s, v2.4s, v3.4s addp v2.4s, v2.4s, v2.4s shl v3.2s, v2.2s, #2 add v2.2s, v2.2s, v3.2s rshrn v5.4h, v2.4s, #6 // b, c, x, x addp v2.4h, v5.4h, v5.4h shl v3.4h, v2.4h, #3 sub v3.4h, v3.4h, v2.4h // 7 * (b + c) ext v4.16b, v4.16b, v4.16b, #14 add v4.4h, v4.4h, v7.4h shl v2.4h, v4.4h, #4 // a sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16 ext v7.16b, v7.16b, v7.16b, #14 mov v7.h[0], wzr dup v3.8h, v5.h[0] mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b dup v1.8h, v2.h[0] // pix dup v2.8h, v5.h[1] // c shl v3.8h, v3.8h, #3 add v1.8h, v1.8h, v0.8h // pix + x*b add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b mov x3, #16 1: subs x3, x3, #1 sqshrun v0.8b, v1.8h, #5 add v1.8h, v1.8h, v2.8h sqshrun2 v0.16b, v3.8h, #5 add v3.8h, v3.8h, v2.8h st1 {v0.16b}, [x0], x1 b.ne 1b ret endfunc x264-master/common/aarch64/predict-c.c000066400000000000000000000101661502133446700176210ustar00rootroot00000000000000/***************************************************************************** * predict.c: aarch64 intra prediction ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "predict.h" #include "pixel.h" void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] ) { #if !HIGH_BIT_DEPTH if( cpu&X264_CPU_ARMV8 ) { pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64; pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64; } if( cpu&X264_CPU_NEON ) { pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_neon; pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_neon; } #endif // !HIGH_BIT_DEPTH } void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] ) { #if !HIGH_BIT_DEPTH if( cpu&X264_CPU_ARMV8 ) { pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64; } if( !(cpu&X264_CPU_NEON) ) return; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon; pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_neon; pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_neon; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_neon; pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_neon; pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_neon; pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon; pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon; pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon; pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon; pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon; pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon; #endif // !HIGH_BIT_DEPTH } x264-master/common/aarch64/predict.h000066400000000000000000000146711502133446700174130ustar00rootroot00000000000000/***************************************************************************** * predict.h: aarch64 intra prediction ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_AARCH64_PREDICT_H #define X264_AARCH64_PREDICT_H #define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64) void x264_predict_4x4_h_aarch64( uint8_t *src ); #define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64) void x264_predict_4x4_v_aarch64( uint8_t *src ); #define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64) void x264_predict_8x8c_v_aarch64( uint8_t *src ); // for the merged 4x4 intra sad/satd which expects unified suffix #define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64 #define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64 #define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64 #define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon) void x264_predict_4x4_dc_top_neon( uint8_t *src ); #define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon) void x264_predict_4x4_ddr_neon( uint8_t *src ); #define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon) void x264_predict_4x4_ddl_neon( uint8_t *src ); #define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon) void x264_predict_8x8c_dc_top_neon( uint8_t *src ); #define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon) void x264_predict_8x8c_dc_left_neon( uint8_t *src ); #define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon) void x264_predict_8x8c_p_neon( uint8_t *src ); #define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon) void x264_predict_8x16c_dc_left_neon( uint8_t *src ); #define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon) void x264_predict_8x16c_dc_top_neon( uint8_t *src ); #define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon) void x264_predict_8x16c_p_neon( uint8_t *src ); #define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon) void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon) void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon) void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon) void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon) void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon) void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon) void x264_predict_16x16_dc_top_neon( uint8_t *src ); #define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon) void x264_predict_16x16_dc_left_neon( uint8_t *src ); #define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon) void x264_predict_16x16_p_neon( uint8_t *src ); #define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon) void 
x264_predict_4x4_dc_neon( uint8_t *src ); #define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon) void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon) void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon) void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon) void x264_predict_8x8c_dc_neon( uint8_t *src ); #define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon) void x264_predict_8x8c_h_neon( uint8_t *src ); #define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon) void x264_predict_8x16c_v_neon( uint8_t *src ); #define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon) void x264_predict_8x16c_h_neon( uint8_t *src ); #define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon) void x264_predict_8x16c_dc_neon( uint8_t *src ); #define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon) void x264_predict_16x16_v_neon( uint8_t *src ); #define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon) void x264_predict_16x16_h_neon( uint8_t *src ); #define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon) void x264_predict_16x16_dc_neon( uint8_t *src ); #define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64) void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] ); #define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64) void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); #define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64) void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64) void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64) void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] ); #endif /* X264_AARCH64_PREDICT_H */ x264-master/common/aarch64/quant-a.S000066400000000000000000000765741502133446700173140ustar00rootroot00000000000000/**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * Martin Storsjo * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" // This is a common function for both 8 and 10 bit depth, since these two differ // at data loading only. The distinction is based on the depth parameters that //are passed to the macro. .macro decimate_score_1x size depth function decimate_score\size\()_neon, export=1 .if BIT_DEPTH == 8 ld1 {v0.8h,v1.8h}, [x0] movrel x5, X264(decimate_table4) movi v3.16b, #0x01 sqxtn v0.8b, v0.8h sqxtn2 v0.16b, v1.8h .else // BIT_DEPTH == 8 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] movrel x5, X264(decimate_table4) sqxtn v20.4h, v0.4s sqxtn2 v20.8h, v1.4s sqxtn v21.4h, v2.4s sqxtn2 v21.8h, v3.4s sqxtn v0.8b, v20.8h sqxtn2 v0.16b, v21.8h .endif // BIT_DEPTH == 8 movi v3.16b, #0x01 abs v2.16b, v0.16b cmeq v1.16b, v0.16b, #0 cmhi v2.16b, v2.16b, v3.16b shrn v1.8b, v1.8h, #4 shrn v2.8b, v2.8h, #4 fmov x2, d2 fmov x1, d1 cbnz x2, 9f mvn x1, x1 mov w0, #0 cbz x1, 0f .ifc \size, 15 lsr x1, x1, #1 .endif rbit x1, x1 1: clz x3, x1 lsr x6, x3, #2 lsl x1, x1, x3 ldrb w7, [x5, x6] lsl x1, x1, #4 add w0, w0, w7 cbnz x1, 1b ret 9: mov w0, #9 0: ret endfunc .endm const mask64, align=6 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 endconst .macro decimate_score64 depth function decimate_score64_neon, export=1 .if BIT_DEPTH == 8 ld1 {v0.8h, v1.8h}, [x0], #32 ld1 {v2.8h, v3.8h}, [x0], #32 ld1 {v4.8h, v5.8h}, [x0], #32 ld1 {v6.8h, v7.8h}, [x0] sqxtn v16.8b, v1.8h sqxtn2 v16.16b, v0.8h sqxtn v17.8b, v3.8h sqxtn2 v17.16b, v2.8h sqxtn v18.8b, v5.8h sqxtn2 v18.16b, v4.8h sqxtn v19.8b, v7.8h sqxtn2 v19.16b, v6.8h .else // BIT_DEPTH == 8 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0] sqxtn v28.4h, v0.4s sqxtn2 v28.8h, v1.4s sqxtn v0.4h, v2.4s sqxtn2 v0.8h, v3.4s sqxtn v2.4h, v6.4s sqxtn2 v2.8h, v7.4s sqxtn v3.4h, v4.4s sqxtn2 v3.8h, v5.4s sqxtn v4.4h, v22.4s sqxtn2 v4.8h, v23.4s sqxtn v5.4h, v20.4s sqxtn2 v5.8h, v21.4s sqxtn v6.4h, v26.4s sqxtn2 v6.8h, v27.4s sqxtn v7.4h, v24.4s sqxtn2 v7.8h, v25.4s sqxtn v16.8b, v0.8h sqxtn2 v16.16b, v28.8h sqxtn v17.8b, v2.8h sqxtn2 v17.16b, v3.8h sqxtn v18.8b, v4.8h sqxtn2 v18.16b, v5.8h sqxtn v19.8b, v6.8h sqxtn2 v19.16b, v7.8h .endif // BIT_DEPTH == 8 movrel x6, mask64 movi v31.16b, #0x01 abs v4.16b, v16.16b abs v5.16b, v17.16b abs v6.16b, v18.16b abs v7.16b, v19.16b ld1 {v30.16b}, [x6] cmeq v0.16b, v16.16b, #0 cmeq v1.16b, v17.16b, #0 cmeq v2.16b, v18.16b, #0 cmeq v3.16b, v19.16b, #0 umax v4.16b, v4.16b, v5.16b umax v6.16b, v6.16b, v7.16b and v0.16b, v0.16b, v30.16b and v1.16b, v1.16b, v30.16b and v2.16b, v2.16b, v30.16b and v3.16b, v3.16b, v30.16b umax v4.16b, v4.16b, v6.16b addp v0.16b, v1.16b, v0.16b addp v2.16b, v3.16b, v2.16b cmhi v4.16b, v4.16b, v31.16b addp v0.16b, v2.16b, v0.16b shrn v4.8b, v4.8h, #4 addp v0.16b, v0.16b, v0.16b fmov x2, d4 fmov x1, d0 cbnz x2, 9f mvn x1, x1 mov w0, #0 cbz x1, 0f movrel x5, X264(decimate_table8) 1: clz x3, x1 lsl x1, x1, x3 ldrb w7, [x5, x3] lsl x1, x1, #1 add w0, w0, w7 cbnz x1, 1b ret 9: mov w0, #9 0: ret endfunc .endm .macro COEFF_LAST_1x size, sub_factor function coeff_last\size\()_neon, export=1 .if \size == 15 sub x0, x0, \sub_factor .endif .if BIT_DEPTH == 8 ld1 {v0.8h, v1.8h}, [x0] uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h .else // BIT_DEPTH == 8 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] uqxtn v0.4h, v0.4s uqxtn2 v0.8h, v1.4s uqxtn v1.4h, v2.4s uqxtn2 v1.8h, v3.4s 
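// The 16 loaded coefficients are now saturated to 16 bits in v0/v1; the final
// narrowing below packs them into a single 16-byte vector so the cmtst/shrn
// pair that follows can build one 64-bit nibble mask of the nonzero lanes.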
uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h .endif // BIT_DEPTH == 8 cmtst v0.16b, v0.16b, v0.16b shrn v0.8b, v0.8h, #4 fmov x1, d0 mov w3, #\size - 1 clz x2, x1 sub w0, w3, w2, lsr #2 ret endfunc .endm .macro COEFF_LAST64 function coeff_last64_neon, export=1 .if BIT_DEPTH == 8 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], 64 movi v31.8h, #8 movi v30.8h, #1 uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], 64 uqxtn v1.8b, v2.8h uqxtn2 v1.16b, v3.8h uqxtn v2.8b, v4.8h uqxtn2 v2.16b, v5.8h uqxtn v3.8b, v6.8h uqxtn2 v3.16b, v7.8h .else // BIT_DEPTH == 8 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 movi v31.8h, #8 movi v30.8h, #1 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 uqxtn v0.4h, v0.4s uqxtn2 v0.8h, v1.4s uqxtn v1.4h, v2.4s uqxtn2 v1.8h, v3.4s uqxtn v2.4h, v4.4s uqxtn2 v2.8h, v5.4s ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 uqxtn v3.4h, v6.4s uqxtn2 v3.8h, v7.4s uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h uqxtn v1.8b, v2.8h uqxtn2 v1.16b, v3.8h ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 uqxtn v16.4h, v16.4s uqxtn2 v16.8h, v17.4s uqxtn v17.4h, v18.4s uqxtn2 v17.8h, v19.4s uqxtn v18.4h, v20.4s uqxtn2 v18.8h, v21.4s uqxtn v19.4h, v22.4s uqxtn2 v19.8h, v23.4s uqxtn v2.8b, v16.8h uqxtn2 v2.16b, v17.8h uqxtn v3.8b, v18.8h uqxtn2 v3.16b, v19.8h .endif // BIT_DEPTH == 8 cmtst v0.16b, v0.16b, v0.16b cmtst v1.16b, v1.16b, v1.16b cmtst v2.16b, v2.16b, v2.16b cmtst v3.16b, v3.16b, v3.16b shrn v0.8b, v0.8h, #4 shrn2 v0.16b, v1.8h, #4 shrn v1.8b, v2.8h, #4 shrn2 v1.16b, v3.8h, #4 clz v0.4s, v0.4s clz v1.4s, v1.4s shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 sub v0.8h, v31.8h, v0.8h sshl v0.8h, v30.8h, v0.8h shrn v0.8b, v0.8h, #1 fmov x2, d0 mov w3, #63 clz x2, x2 sub w0, w3, w2 ret endfunc .endm .macro coeff_level_run_start size, mask add x6, x1, #\mask // runlevel->mask mov w7, #0 mov w8, #0 mov w9, #1 mov w4, #\size - 1 .endm .macro coeff_level_run shift, depth clz x3, x2 subs w4, w4, w3, lsr #\shift str w4, [x1], #4 1: .ifc \depth, 8 ldrh w5, [x0, x4, lsl #1] strh w5, [x6], #2 .else lsl w5, w4, #2 ldr w5, [x0, x5] str w5, [x6], #4 .endif add w7, w7, #1 lsl w10, w9, w4 orr w8, w8, w10 b.le 2f add w3, w3, #1 << \shift sub w4, w4, #1 and x3, x3, #~((1 << \shift) - 1) lsl x2, x2, x3 clz x3, x2 subs w4, w4, w3, lsr #\shift b.ge 1b 2: str w8, [x1] mov w0, w7 .endm .if BIT_DEPTH == 8 .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask add v18.8h, v18.8h, \bias0 add v19.8h, v19.8h, \bias1 umull v20.4s, v18.4h, \mf0_1\().4h umull2 v21.4s, v18.8h, \mf0_1\().8h umull v22.4s, v19.4h, \mf2_3\().4h umull2 v23.4s, v19.8h, \mf2_3\().8h sshr v16.8h, v16.8h, #15 sshr v17.8h, v17.8h, #15 shrn v18.4h, v20.4s, #16 shrn2 v18.8h, v21.4s, #16 shrn v19.4h, v22.4s, #16 shrn2 v19.8h, v23.4s, #16 eor v18.16b, v18.16b, v16.16b eor v19.16b, v19.16b, v17.16b sub v18.8h, v18.8h, v16.8h sub v19.8h, v19.8h, v17.8h orr \mask, v18.16b, v19.16b st1 {v18.8h,v19.8h}, [x0], #32 .endm .macro QUANT_END d fmov x2, \d mov w0, #0 tst x2, x2 cinc w0, w0, ne ret .endm // quant_2x2_dc( int16_t dct[4], int mf, int bias ) function quant_2x2_dc_neon, export=1 ld1 {v0.4h}, [x0] dup v2.4h, w2 dup v1.4h, w1 abs v3.4h, v0.4h add v3.4h, v3.4h, v2.4h umull v3.4s, v3.4h, v1.4h sshr v0.4h, v0.4h, #15 shrn v3.4h, v3.4s, #16 eor v3.8b, v3.8b, v0.8b sub v3.4h, v3.4h, v0.4h st1 {v3.4h}, [x0] QUANT_END d3 endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) function quant_4x4_dc_neon, export=1 ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h dup v0.8h, w2 dup v2.8h, w1 QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b uqxtn v0.8b, 
v0.8h QUANT_END d0 endfunc // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) function quant_4x4_neon, export=1 ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h ld1 {v0.8h,v1.8h}, [x2] ld1 {v2.8h,v3.8h}, [x1] QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b uqxtn v0.8b, v0.8h QUANT_END d0 endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) function quant_4x4x4_neon, export=1 ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h ld1 {v0.8h,v1.8h}, [x2] ld1 {v2.8h,v3.8h}, [x1] QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b uqxtn v4.8b, v4.8h uqxtn v7.8b, v7.8h uqxtn v6.8b, v6.8h uqxtn v5.8b, v5.8h fmov x7, d7 fmov x6, d6 fmov x5, d5 fmov x4, d4 mov w0, #0 tst x7, x7 cinc w0, w0, ne lsl w0, w0, #1 tst x6, x6 cinc w0, w0, ne lsl w0, w0, #1 tst x5, x5 cinc w0, w0, ne lsl w0, w0, #1 tst x4, x4 cinc w0, w0, ne ret endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function quant_8x8_neon, export=1 ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h ld1 {v0.8h,v1.8h}, [x2], #32 ld1 {v2.8h,v3.8h}, [x1], #32 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b .rept 3 ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h ld1 {v0.8h,v1.8h}, [x2], #32 ld1 {v2.8h,v3.8h}, [x1], #32 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b orr v4.16b, v4.16b, v5.16b .endr uqxtn v0.8b, v4.8h QUANT_END d0 endfunc .macro DEQUANT_START mf_size offset dc=no mov w3, #0x2b mul w3, w3, w2 lsr w3, w3, #8 // i_qbits = i_qp / 6 add w5, w3, w3, lsl #1 sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6 lsl w2, w2, #\mf_size .ifc \dc,no add x1, x1, w2, sxtw // dequant_mf[i_mf] .else ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0] .endif subs w3, w3, #\offset // 6 for 8x8 .endm // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp ) .macro DEQUANT size bits function dequant_\size\()_neon, export=1 DEQUANT_START \bits+2, \bits .ifc \size, 8x8 mov w2, #4 .endif b.lt dequant_\size\()_rshift dup v31.8h, w3 dequant_\size\()_lshift_loop: .ifc \size, 8x8 subs w2, w2, #1 .endif ld1 {v16.4s}, [x1], #16 ld1 {v17.4s}, [x1], #16 sqxtn v2.4h, v16.4s ld1 {v18.4s}, [x1], #16 sqxtn2 v2.8h, v17.4s ld1 {v19.4s}, [x1], #16 sqxtn v3.4h, v18.4s ld1 {v0.8h,v1.8h}, [x0] sqxtn2 v3.8h, v19.4s mul v0.8h, v0.8h, v2.8h mul v1.8h, v1.8h, v3.8h sshl v0.8h, v0.8h, v31.8h sshl v1.8h, v1.8h, v31.8h st1 {v0.8h,v1.8h}, [x0], #32 .ifc \size, 8x8 b.gt dequant_\size\()_lshift_loop .endif ret dequant_\size\()_rshift: dup v31.4s, w3 .ifc \size, 8x8 dequant_\size\()_rshift_loop: subs w2, w2, #1 .endif ld1 {v16.4s}, [x1], #16 ld1 {v17.4s}, [x1], #16 sqxtn v2.4h, v16.4s ld1 {v18.4s}, [x1], #16 sqxtn2 v2.8h, v17.4s ld1 {v19.4s}, [x1], #16 sqxtn v3.4h, v18.4s ld1 {v0.8h,v1.8h}, [x0] sqxtn2 v3.8h, v19.4s smull v16.4s, v0.4h, v2.4h smull2 v17.4s, v0.8h, v2.8h smull v18.4s, v1.4h, v3.4h smull2 v19.4s, v1.8h, v3.8h srshl v16.4s, v16.4s, v31.4s srshl v17.4s, v17.4s, v31.4s srshl v18.4s, v18.4s, v31.4s srshl v19.4s, v19.4s, v31.4s sqxtn v0.4h, v16.4s sqxtn2 v0.8h, v17.4s sqxtn v1.4h, v18.4s sqxtn2 v1.8h, v19.4s st1 {v0.8h,v1.8h}, [x0], #32 .ifc \size, 8x8 b.gt dequant_\size\()_rshift_loop .endif ret endfunc .endm DEQUANT 4x4, 4 DEQUANT 8x8, 6 // dequant_4x4_dc( int16_t dct[16], int 
dequant_mf[6][16], int i_qp ) function dequant_4x4_dc_neon, export=1 DEQUANT_START 6, 6, yes b.lt dequant_4x4_dc_rshift lsl w1, w1, w3 dup v2.8h, w1 ld1 {v0.8h,v1.8h}, [x0] mul v0.8h, v0.8h, v2.8h mul v1.8h, v1.8h, v2.8h st1 {v0.8h,v1.8h}, [x0] ret dequant_4x4_dc_rshift: dup v4.8h, w1 dup v3.4s, w3 ld1 {v0.8h,v1.8h}, [x0] smull v16.4s, v0.4h, v4.4h smull2 v17.4s, v0.8h, v4.8h smull v18.4s, v1.4h, v4.4h smull2 v19.4s, v1.8h, v4.8h srshl v16.4s, v16.4s, v3.4s srshl v17.4s, v17.4s, v3.4s srshl v18.4s, v18.4s, v3.4s srshl v19.4s, v19.4s, v3.4s sqxtn v0.4h, v16.4s sqxtn2 v0.8h, v17.4s sqxtn v1.4h, v18.4s sqxtn2 v1.8h, v19.4s st1 {v0.8h,v1.8h}, [x0] ret endfunc decimate_score_1x 15 decimate_score_1x 16 decimate_score64 // int coeff_last( int16_t *l ) function coeff_last4_aarch64, export=1 ldr x2, [x0] mov w4, #3 clz x0, x2 sub w0, w4, w0, lsr #4 ret endfunc function coeff_last8_aarch64, export=1 ldr x3, [x0, #8] mov w4, #7 clz x2, x3 cmp w2, #64 b.ne 1f ldr x3, [x0] sub w4, w4, #4 clz x2, x3 1: sub w0, w4, w2, lsr #4 ret endfunc COEFF_LAST_1x 15, #2 COEFF_LAST_1x 16, #2 COEFF_LAST64 function coeff_level_run4_aarch64, export=1 ldr x2, [x0] coeff_level_run_start 4, 23 and x6, x6, #~15 coeff_level_run 4, 8 ret endfunc .macro X264_COEFF_LEVEL_RUN size function coeff_level_run\size\()_neon, export=1 .if \size == 15 sub x0, x0, #2 .endif .if \size < 15 ld1 {v0.8h}, [x0] uqxtn v0.8b, v0.8h cmtst v0.8b, v0.8b, v0.8b .else ld1 {v0.8h,v1.8h}, [x0] uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h cmtst v0.16b, v0.16b, v0.16b shrn v0.8b, v0.8h, #4 .endif fmov x2, d0 .if \size == 15 add x0, x0, #2 .endif coeff_level_run_start \size, 23 and x6, x6, #~15 coeff_level_run (4 - (\size + 1) / 8), 8 ret endfunc .endm X264_COEFF_LEVEL_RUN 8 X264_COEFF_LEVEL_RUN 15 X264_COEFF_LEVEL_RUN 16 function denoise_dct_neon, export=1 1: subs w3, w3, #16 ld1 {v0.8h,v1.8h}, [x0] ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1] abs v16.8h, v0.8h abs v17.8h, v1.8h ld1 {v2.8h,v3.8h}, [x2], #32 cmlt v18.8h, v0.8h, #0 cmlt v19.8h, v1.8h, #0 uaddw v4.4s, v4.4s, v16.4h uaddw2 v5.4s, v5.4s, v16.8h uqsub v20.8h, v16.8h, v2.8h uqsub v21.8h, v17.8h, v3.8h uaddw v6.4s, v6.4s, v17.4h uaddw2 v7.4s, v7.4s, v17.8h neg v22.8h, v20.8h neg v23.8h, v21.8h bsl v18.16b, v22.16b, v20.16b bsl v19.16b, v23.16b, v21.16b st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64 st1 {v18.8h,v19.8h}, [x0], #32 b.gt 1b ret endfunc .else // BIT_DEPTH == 8 .macro QUANT_TWO mask add v20.4s, v20.4s, v0.4s add v21.4s, v21.4s, v1.4s add v22.4s, v22.4s, v2.4s add v23.4s, v23.4s, v3.4s mul v24.4s, v20.4s, v4.4s mul v25.4s, v21.4s, v5.4s mul v26.4s, v22.4s, v6.4s mul v27.4s, v23.4s, v7.4s sshr v16.4s, v16.4s, #31 sshr v17.4s, v17.4s, #31 sshr v18.4s, v18.4s, #31 sshr v19.4s, v19.4s, #31 sshr v20.4s, v24.4s, #16 sshr v21.4s, v25.4s, #16 sshr v22.4s, v26.4s, #16 sshr v23.4s, v27.4s, #16 eor v20.16b, v20.16b, v16.16b eor v21.16b, v21.16b, v17.16b eor v22.16b, v22.16b, v18.16b eor v23.16b, v23.16b, v19.16b sub v20.4s, v20.4s, v16.4s sub v21.4s, v21.4s, v17.4s sub v22.4s, v22.4s, v18.4s sub v23.4s, v23.4s, v19.4s orr \mask, v20.16b, v21.16b orr v16.16b, v22.16b, v23.16b orr \mask, \mask, v16.16b st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 .endm .macro QUANT_END d // Use parameter d as a register number and extract upper and lower halves. 
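// The two 64-bit halves of v\d (the OR of the quantized outputs) are combined
// in general registers; the function returns 1 if any lane is nonzero, i.e.
// if quantization left at least one nonzero coefficient, and 0 otherwise.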
fmov x2, d\d fmov x3, v\d\().d[1] orr x2, x2, x3 mov w0, #0 tst x2, x2 cinc w0, w0, ne ret .endm // quant_2x2_dc( dctcoef dct[4], int mf, int bias ) function quant_2x2_dc_neon, export=1 ld1 {v0.4s}, [x0] dup v2.4s, w2 dup v1.4s, w1 abs v3.4s, v0.4s add v3.4s, v3.4s, v2.4s mul v3.4s, v3.4s, v1.4s sshr v0.4s, v0.4s, #31 sshr v3.4s, v3.4s, #16 eor v3.16b, v3.16b, v0.16b sub v0.4s, v3.4s, v0.4s st1 {v0.4s}, [x0] QUANT_END 0 endfunc // quant_4x4_dc( dctcoef dct[16], int mf, int bias ) function quant_4x4_dc_neon, export=1 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s dup v0.4s, w2 dup v1.4s, w2 dup v2.4s, w2 dup v3.4s, w2 dup v4.4s, w1 dup v5.4s, w1 dup v6.4s, w1 dup v7.4s, w1 QUANT_TWO v0.16b QUANT_END 0 endfunc // quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ) function quant_4x4_neon, export=1 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1] QUANT_TWO v0.16b QUANT_END 0 endfunc // quant_4x4x4( dctcoef dct[4][16], uint32_t mf[16], uint32_t bias[16] ) function quant_4x4x4_neon, export=1 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s QUANT_TWO v28.16b ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s QUANT_TWO v29.16b ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s QUANT_TWO v30.16b ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s QUANT_TWO v31.16b uqxtn v28.4h, v28.4s uqxtn v29.4h, v29.4s uqxtn v30.4h, v30.4s uqxtn v31.4h, v31.4s fmov x7, d28 fmov x6, d29 fmov x10, d30 fmov x12, d31 mov w0, #0 tst x12, x12 cinc w0, w0, ne lsl w0, w0, #1 tst x10, x10 cinc w0, w0, ne lsl w0, w0, #1 tst x6, x6 cinc w0, w0, ne lsl w0, w0, #1 tst x7, x7 cinc w0, w0, ne ret endfunc // quant_8x8( dctcoef dct[64], uint32_t mf[64], uint32_t bias[64] ) function quant_8x8_neon, export=1 ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 QUANT_TWO v28.16b ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 QUANT_TWO v29.16b ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 QUANT_TWO v30.16b ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] abs v20.4s, v16.4s abs v21.4s, v17.4s abs v22.4s, v18.4s abs v23.4s, v19.4s ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 QUANT_TWO v31.16b orr v0.16b, v28.16b, v29.16b orr v0.16b, v0.16b, v30.16b orr v0.16b, v0.16b, v31.16b QUANT_END 0 endfunc .macro DEQUANT_START mf_size offset dc=no mov w3, #0x2b mul w3, w3, w2 lsr w3, w3, #8 // i_qbits = i_qp / 6 add w5, w3, w3, lsl #1 sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6 lsl w2, w2, #\mf_size .ifc \dc,no add x1, x1, w2, sxtw // dequant_mf[i_mf] .else ldr x1, [x1, 
w2, sxtw] // dequant_mf[i_mf][0][0] .endif subs w3, w3, #\offset // 6 for 8x8 .endm // dequant_4x4( int32_t dct[16], int dequant_mf[6][16], int i_qp ) .macro DEQUANT size bits function dequant_\size\()_neon, export=1 DEQUANT_START \bits+2, \bits .ifc \size, 8x8 mov w2, #4 .endif b.lt dequant_\size\()_rshift dup v31.4s, w3 dequant_\size\()_lshift_loop: .ifc \size, 8x8 subs w2, w2, #1 .endif ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] mul v0.4s, v0.4s, v16.4s mul v1.4s, v1.4s, v17.4s mul v2.4s, v2.4s, v18.4s mul v3.4s, v3.4s, v19.4s sshl v0.4s, v0.4s, v31.4s sshl v1.4s, v1.4s, v31.4s sshl v2.4s, v2.4s, v31.4s sshl v3.4s, v3.4s, v31.4s st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 .ifc \size, 8x8 b.gt dequant_\size\()_lshift_loop .endif ret dequant_\size\()_rshift: dup v31.4s, w3 .ifc \size, 8x8 dequant_\size\()_rshift_loop: subs w2, w2, #1 .endif ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] mul v20.4s, v0.4s, v16.4s mul v21.4s, v1.4s, v17.4s mul v22.4s, v2.4s, v18.4s mul v23.4s, v3.4s, v19.4s srshl v16.4s, v20.4s, v31.4s srshl v17.4s, v21.4s, v31.4s srshl v18.4s, v22.4s, v31.4s srshl v19.4s, v23.4s, v31.4s st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 .ifc \size, 8x8 b.gt dequant_\size\()_rshift_loop .endif ret endfunc .endm DEQUANT 4x4, 4 DEQUANT 8x8, 6 // dequant_4x4_dc( int32_t dct[16], int dequant_mf[6][16], int i_qp ) function dequant_4x4_dc_neon, export=1 DEQUANT_START 6, 6, yes b.lt dequant_4x4_dc_rshift lsl w1, w1, w3 dup v31.4s, w1 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] mul v0.4s, v0.4s, v31.4s mul v1.4s, v1.4s, v31.4s mul v2.4s, v2.4s, v31.4s mul v3.4s, v3.4s, v31.4s st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] ret dequant_4x4_dc_rshift: dup v31.4s, w1 dup v30.4s, w3 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] mul v16.4s, v0.4s, v31.4s mul v17.4s, v1.4s, v31.4s mul v18.4s, v2.4s, v31.4s mul v19.4s, v3.4s, v31.4s srshl v16.4s, v16.4s, v30.4s srshl v17.4s, v17.4s, v30.4s srshl v18.4s, v18.4s, v30.4s srshl v19.4s, v19.4s, v30.4s st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] ret endfunc decimate_score_1x 15 decimate_score_1x 16 decimate_score64 // int coeff_last( int32_t *l ) function coeff_last4_neon, export=1 ld1 {v0.4s}, [x0] uqxtn v0.4h, v0.4s uqxtn v0.8b, v0.8h mov w4, #3 cmtst v0.16b, v0.16b, v0.16b fmov w1, s0 clz w2, w1 sub w0, w4, w2, lsr #3 ret endfunc function coeff_last8_neon, export=1 ld1 {v0.4s, v1.4s}, [x0] uqxtn v0.4h, v0.4s uqxtn2 v0.8h, v1.4s uqxtn v0.8b, v0.8h mov w4, #7 cmtst v0.16b, v0.16b, v0.16b fmov x1, d0 clz x2, x1 sub x0, x4, x2, lsr #3 ret endfunc COEFF_LAST_1x 15, #4 COEFF_LAST_1x 16, #4 COEFF_LAST64 function coeff_level_run4_neon, export=1 ldr x2, [x0] ld1 {v0.4s}, [x0] uqxtn v0.4h, v0.4s uqxtn v0.8b, v0.8h fmov x2, d0 coeff_level_run_start 8, 16 coeff_level_run 3, 10 ret endfunc .macro X264_COEFF_LEVEL_RUN size function coeff_level_run\size\()_neon, export=1 .if \size == 15 sub x0, x0, #4 .endif .if \size < 15 ld1 {v0.4s, v1.4s}, [x0] uqxtn v0.4h, v0.4s uqxtn2 v0.8h, v1.4s uqxtn v0.8b, v0.8h cmtst v0.8b, v0.8b, v0.8b .else ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] uqxtn v0.4h, v0.4s uqxtn2 v0.8h, v1.4s uqxtn v1.4h, v2.4s uqxtn2 v1.8h, v3.4s uqxtn v0.8b, v0.8h uqxtn2 v0.16b, v1.8h cmtst v0.16b, v0.16b, v0.16b shrn v0.8b, v0.8h, #4 .endif fmov x2, d0 .if \size == 15 add x0, x0, #4 .endif coeff_level_run_start \size, 16 coeff_level_run (4 - (\size + 1) / 8), 10 ret endfunc .endm X264_COEFF_LEVEL_RUN 8 X264_COEFF_LEVEL_RUN 15 X264_COEFF_LEVEL_RUN 16 function denoise_dct_neon, export=1 1: subs 
w3, w3, #16 ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1] abs v16.4s, v0.4s abs v17.4s, v1.4s abs v18.4s, v2.4s abs v19.4s, v3.4s cmlt v24.4s, v0.4s, #0 cmlt v25.4s, v1.4s, #0 cmlt v26.4s, v2.4s, #0 cmlt v27.4s, v3.4s, #0 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64 add v4.4s, v4.4s, v16.4s add v5.4s, v5.4s, v17.4s sub v28.4s, v16.4s, v20.4s sub v29.4s, v17.4s, v21.4s sub v30.4s, v18.4s, v22.4s sub v31.4s, v19.4s, v23.4s add v6.4s, v6.4s, v18.4s add v7.4s, v7.4s, v19.4s cmlt v20.4s, v28.4s, #0 cmlt v21.4s, v29.4s, #0 cmlt v22.4s, v30.4s, #0 cmlt v23.4s, v31.4s, #0 movi v0.4s, #0 bsl v20.16b, v0.16b, v28.16b bsl v21.16b, v0.16b, v29.16b bsl v22.16b, v0.16b, v30.16b bsl v23.16b, v0.16b, v31.16b neg v0.4s, v20.4s neg v1.4s, v21.4s neg v2.4s, v22.4s neg v3.4s, v23.4s bsl v24.16b, v0.16b, v20.16b bsl v25.16b, v1.16b, v21.16b bsl v26.16b, v2.16b, v22.16b bsl v27.16b, v3.16b, v23.16b st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 b.gt 1b ret endfunc .endif x264-master/common/aarch64/quant.h000066400000000000000000000107641502133446700171100ustar00rootroot00000000000000/***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_AARCH64_QUANT_H #define X264_AARCH64_QUANT_H #define x264_quant_2x2_dc_aarch64 x264_template(quant_2x2_dc_aarch64) int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias ); #define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon) int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias ); #define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon) int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_neon x264_template(quant_4x4_neon) int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon) int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_8x8_neon x264_template(quant_8x8_neon) int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon) void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4_neon x264_template(dequant_4x4_neon) void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_neon x264_template(dequant_8x8_neon) void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_decimate_score15_neon x264_template(decimate_score15_neon) int x264_decimate_score15_neon( dctcoef * ); #define x264_decimate_score16_neon x264_template(decimate_score16_neon) int x264_decimate_score16_neon( dctcoef * ); #define x264_decimate_score64_neon x264_template(decimate_score64_neon) int x264_decimate_score64_neon( dctcoef * ); // BIT DEPTH = 8 #define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64) int x264_coeff_last4_aarch64( dctcoef * ); #define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64) int x264_coeff_last8_aarch64( dctcoef * ); // BIT DEPTH = 10 #define x264_coeff_last4_neon x264_template(coeff_last4_neon) int x264_coeff_last4_neon( dctcoef * ); #define x264_coeff_last8_neon x264_template(coeff_last8_neon) int x264_coeff_last8_neon( dctcoef * ); #define x264_coeff_last15_neon x264_template(coeff_last15_neon) int x264_coeff_last15_neon( dctcoef * ); #define x264_coeff_last16_neon x264_template(coeff_last16_neon) int x264_coeff_last16_neon( dctcoef * ); #define x264_coeff_last64_neon x264_template(coeff_last64_neon) int x264_coeff_last64_neon( dctcoef * ); // BIT_DEPTH = 8 #define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64) int x264_coeff_level_run4_aarch64( dctcoef *, x264_run_level_t * ); // BIT_DEPTH = 10 #define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon) int x264_coeff_level_run4_neon( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon) int x264_coeff_level_run8_neon( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon) int x264_coeff_level_run15_neon( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon) int x264_coeff_level_run16_neon( dctcoef *, x264_run_level_t * ); #define x264_denoise_dct_neon x264_template(denoise_dct_neon) void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int ); #endif 
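/* A minimal C sketch, for reference only, of the scalar dequant logic that the
 * NEON dequant_4x4/dequant_8x8 kernels in quant-a.S implement, assuming the
 * 8-bit build where dctcoef is int16_t.  The QP split matches DEQUANT_START:
 * i_mf = i_qp%6 selects the scaling table and i_qbits = i_qp/6 - 4 decides
 * between a plain left shift and a rounded right shift (the NEON code takes
 * the same two paths with sshl/srshl and a vector shift).  The function name
 * below is illustrative, not an x264 API. */
#include <stdint.h>

static void dequant_4x4_sketch( int16_t dct[16], const int dequant_mf[6][16], int i_qp )
{
    const int i_mf    = i_qp % 6;
    const int i_qbits = i_qp / 6 - 4;

    if( i_qbits >= 0 )
        for( int i = 0; i < 16; i++ )
            dct[i] = (int16_t)( ( dct[i] * dequant_mf[i_mf][i] ) << i_qbits );
    else
    {
        const int f = 1 << ( -i_qbits - 1 );  /* rounding term, as srshl applies */
        for( int i = 0; i < 16; i++ )
            dct[i] = (int16_t)( ( dct[i] * dequant_mf[i_mf][i] + f ) >> -i_qbits );
    }
}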
x264-master/common/arm/000077500000000000000000000000001502133446700151265ustar00rootroot00000000000000x264-master/common/arm/asm.S000066400000000000000000000133761502133446700160440ustar00rootroot00000000000000/***************************************************************************** * asm.S: arm utility macros ***************************************************************************** * Copyright (C) 2008-2025 x264 project * * Authors: Mans Rullgard * David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "config.h" .syntax unified #ifdef __ELF__ .arch armv7-a .fpu neon #endif #define GLUE(a, b) a ## b #define JOIN(a, b) GLUE(a, b) #ifdef PREFIX # define BASE _x264_ # define SYM_PREFIX _ #else # define BASE x264_ # define SYM_PREFIX #endif #ifdef BIT_DEPTH # define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _) #else # define EXTERN_ASM BASE #endif #define X(s) JOIN(EXTERN_ASM, s) #define X264(s) JOIN(BASE, s) #define EXT(s) JOIN(SYM_PREFIX, s) #ifdef __ELF__ # define ELF #else # define ELF @ #endif #ifdef __MACH__ # define MACH # define NONMACH @ #else # define MACH @ # define NONMACH #endif #if HAVE_AS_FUNC # define FUNC #else # define FUNC @ #endif #if SYS_LINUX || SYS_OPENBSD #define HAVE_SECTION_DATA_REL_RO 1 #else #define HAVE_SECTION_DATA_REL_RO 0 #endif .macro require8, val=1 ELF .eabi_attribute 24, \val .endm .macro preserve8, val=1 ELF .eabi_attribute 25, \val .endm .macro function name, export=1 .macro endfunc .if \export ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name .else ELF .size \name, . - \name .endif FUNC .endfunc .purgem endfunc .endm .text .align 2 .if \export == 1 .global EXTERN_ASM\name ELF .hidden EXTERN_ASM\name ELF .type EXTERN_ASM\name, %function FUNC .func EXTERN_ASM\name EXTERN_ASM\name: .else ELF .hidden \name ELF .type \name, %function FUNC .func \name \name: .endif .endm .macro const name, align=2, relocate=0 .macro endconst ELF .size \name, . 
- \name .purgem endconst .endm .if HAVE_SECTION_DATA_REL_RO && \relocate .section .data.rel.ro .else NONMACH .section .rodata MACH .const_data .endif .align \align \name: .endm .macro movrel rd, val #if defined(PIC) ldr \rd, 1f b 2f 1: @ FIXME: thumb .word \val - (2f + 8) 2: add \rd, \rd, pc #elif HAVE_ARMV6T2 movw \rd, #:lower16:\val movt \rd, #:upper16:\val #else ldr \rd, =\val #endif .endm .macro movrelx rd, val, got #if defined(PIC) && defined(__ELF__) ldr \got, 2f ldr \rd, 1f b 3f 1: @ FIXME: thumb .word \val(GOT) 2: .word _GLOBAL_OFFSET_TABLE_ - (3f + 8) 3: add \got, \got, pc ldr \rd, [\got, \rd] #elif defined(PIC) && defined(__APPLE__) ldr \rd, 1f b 2f 1: @ FIXME: thumb .word 3f - (2f + 8) 2: ldr \rd, [pc, \rd] .non_lazy_symbol_pointer 3: .indirect_symbol \val .word 0 .text #else movrel \rd, \val #endif .endm .macro movconst rd, val #if HAVE_ARMV6T2 movw \rd, #:lower16:\val .if \val >> 16 movt \rd, #:upper16:\val .endif #else ldr \rd, =\val #endif .endm #define FENC_STRIDE 16 #define FDEC_STRIDE 32 .macro HORIZ_ADD dest, a, b .ifnb \b vadd.u16 \a, \a, \b .endif vpaddl.u16 \a, \a vpaddl.u32 \dest, \a .endm .macro SUMSUB_AB sum, diff, a, b vadd.s16 \sum, \a, \b vsub.s16 \diff, \a, \b .endm .macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d SUMSUB_AB \s1, \d1, \a, \b SUMSUB_AB \s2, \d2, \c, \d .endm .macro ABS2 a b vabs.s16 \a, \a vabs.s16 \b, \b .endm // dist = distance in elements (0 for vertical pass, 1/2 for horizontal passes) // op = sumsub/amax (sum and diff / maximum of absolutes) // d1/2 = destination registers // s1/2 = source registers .macro HADAMARD dist, op, d1, d2, s1, s2 .if \dist == 1 vtrn.16 \s1, \s2 .else vtrn.32 \s1, \s2 .endif .ifc \op, sumsub SUMSUB_AB \d1, \d2, \s1, \s2 .else vabs.s16 \s1, \s1 vabs.s16 \s2, \s2 vmax.s16 \d1, \s1, \s2 .endif .endm .macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7 vtrn.32 \r0, \r4 vtrn.32 \r1, \r5 vtrn.32 \r2, \r6 vtrn.32 \r3, \r7 vtrn.16 \r0, \r2 vtrn.16 \r1, \r3 vtrn.16 \r4, \r6 vtrn.16 \r5, \r7 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 vtrn.8 \r4, \r5 vtrn.8 \r6, \r7 .endm .macro TRANSPOSE4x4 r0 r1 r2 r3 vtrn.16 \r0, \r2 vtrn.16 \r1, \r3 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 .endm .macro TRANSPOSE4x4_16 d0 d1 d2 d3 vtrn.32 \d0, \d2 vtrn.32 \d1, \d3 vtrn.16 \d0, \d1 vtrn.16 \d2, \d3 .endm x264-master/common/arm/bitstream-a.S000066400000000000000000000050461502133446700174670ustar00rootroot00000000000000/***************************************************************************** * bitstream-a.S: arm bitstream functions ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" function nal_escape_neon push {r4-r5,lr} vmov.u8 q0, #0xff vmov.u8 q8, #4 mov r3, #3 subs lr, r1, r2 beq 99f 0: cmn lr, #15 blt 16f mov r1, r2 b 100f 16: vld1.8 {q1}, [r1]! vext.8 q2, q0, q1, #14 vext.8 q3, q0, q1, #15 vcgt.u8 q11, q8, q1 vceq.u8 q9, q2, #0 vceq.u8 q10, q3, #0 vand q9, q9, q11 vand q9, q9, q10 vshrn.u16 d22, q9, #4 vmov ip, lr, d22 orrs ip, ip, lr beq 16f mov lr, #-16 100: vmov.u8 r5, d1[6] vmov.u8 r4, d1[7] orr r5, r4, r5, lsl #8 101: ldrb r4, [r1, lr] orr ip, r4, r5, lsl #16 cmp ip, #3 bhi 102f strb r3, [r0], #1 orr r5, r3, r5, lsl #8 102: adds lr, lr, #1 strb r4, [r0], #1 orr r5, r4, r5, lsl #8 blt 101b subs lr, r1, r2 lsr ip, r5, #8 vmov.u8 d1[6], ip vmov.u8 d1[7], r5 blt 0b pop {r4-r5,pc} 16: subs lr, r1, r2 vst1.8 {q1}, [r0]! vmov q0, q1 blt 0b 99: pop {r4-r5,pc} endfunc x264-master/common/arm/bitstream.h000066400000000000000000000026231502133446700172740ustar00rootroot00000000000000/***************************************************************************** * bitstream.h: arm bitstream functions ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ARM_BITSTREAM_H #define X264_ARM_BITSTREAM_H #define x264_nal_escape_neon x264_template(nal_escape_neon) uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); #endif x264-master/common/arm/cpu-a.S000066400000000000000000000066441502133446700162710ustar00rootroot00000000000000/***************************************************************************** * cpu-a.S: arm cpu detection ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" .align 2 // done in gas because .fpu neon overrides the refusal to assemble // instructions the selected -march/-mcpu doesn't support function cpu_neon_test vadd.i16 q0, q0, q0 bx lr endfunc // return: 0 on success // 1 if counters were already enabled // 9 if lo-res counters were already enabled function cpu_enable_armv7_counter, export=0 mrc p15, 0, r2, c9, c12, 0 // read PMNC ands r0, r2, #1 andne r0, r2, #9 orr r2, r2, #1 // enable counters bic r2, r2, #8 // full resolution mcreq p15, 0, r2, c9, c12, 0 // write PMNC mov r2, #1 << 31 // enable cycle counter mcr p15, 0, r2, c9, c12, 1 // write CNTENS bx lr endfunc function cpu_disable_armv7_counter, export=0 mrc p15, 0, r0, c9, c12, 0 // read PMNC bic r0, r0, #1 // disable counters mcr p15, 0, r0, c9, c12, 0 // write PMNC bx lr endfunc .macro READ_TIME r mrc p15, 0, \r, c9, c13, 0 .endm // return: 0 if transfers neon -> arm transfers take more than 10 cycles // nonzero otherwise function cpu_fast_neon_mrc_test // check for user access to performance counters mrc p15, 0, r0, c9, c14, 0 cmp r0, #0 bxeq lr push {r4-r6,lr} bl cpu_enable_armv7_counter ands r1, r0, #8 mov r3, #0 mov ip, #4 mov r6, #4 moveq r5, #1 movne r5, #64 average_loop: mov r4, r5 READ_TIME r1 1: subs r4, r4, #1 .rept 8 vmov.u32 lr, d0[0] add lr, lr, lr .endr bgt 1b READ_TIME r2 subs r6, r6, #1 sub r2, r2, r1 cmpgt r2, #30 << 3 // assume context switch if it took over 30 cycles addle r3, r3, r2 subsle ip, ip, #1 bgt average_loop // disable counters if we enabled them ands r0, r0, #1 bleq cpu_disable_armv7_counter lsr r0, r3, #5 cmp r0, #10 movgt r0, #0 pop {r4-r6,pc} endfunc x264-master/common/arm/dct-a.S000066400000000000000000000571621502133446700162550ustar00rootroot00000000000000/**************************************************************************** * dct-a.S: arm transform and zigzag ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Martin Storsjo * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" const scan4x4_frame, align=4 .byte 0,1, 8,9, 2,3, 4,5 .byte 2,3, 8,9, 16,17, 10,11 .byte 12,13, 6,7, 14,15, 20,21 .byte 10,11, 12,13, 6,7, 14,15 endconst .text // sum = a + (b>>shift) sub = (a>>shift) - b .macro SUMSUB_SHR shift sum sub a b t0 t1 vshr.s16 \t0, \b, #\shift vshr.s16 \t1, \a, #\shift vadd.s16 \sum, \a, \t0 vsub.s16 \sub, \t1, \b .endm // sum = (a>>shift) + b sub = a - (b>>shift) .macro SUMSUB_SHR2 shift sum sub a b t0 t1 vshr.s16 \t0, \a, #\shift vshr.s16 \t1, \b, #\shift vadd.s16 \sum, \t0, \b vsub.s16 \sub, \a, \t1 .endm // a += 1.5*ma b -= 1.5*mb .macro SUMSUB_15 a b ma mb t0 t1 vshr.s16 \t0, \ma, #1 vshr.s16 \t1, \mb, #1 vadd.s16 \t0, \t0, \ma vadd.s16 \t1, \t1, \mb vadd.s16 \a, \a, \t0 vsub.s16 \b, \b, \t1 .endm function dct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3 SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7 vmov.s16 d31, #1 HADAMARD 1, sumsub, q2, q3, q0, q1 vtrn.32 d4, d5 vadd.s16 d16, d4, d31 vtrn.32 d6, d7 vadd.s16 d17, d6, d31 vrhadd.s16 d0, d4, d5 vhsub.s16 d1, d16, d5 vhsub.s16 d2, d17, d7 vrhadd.s16 d3, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr endfunc function idct4x4dc_neon vld1.64 {d0-d3}, [r0,:128] SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3 SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7 HADAMARD 1, sumsub, q2, q3, q0, q1 HADAMARD 2, sumsub, d0, d1, d4, d5 HADAMARD 2, sumsub, d3, d2, d6, d7 vst1.64 {d0-d3}, [r0,:128] bx lr endfunc .macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7 SUMSUB_AB \d1, \d6, \d5, \d6 SUMSUB_AB \d3, \d7, \d4, \d7 vadd.s16 \d0, \d3, \d1 vadd.s16 \d4, \d7, \d7 vadd.s16 \d5, \d6, \d6 vsub.s16 \d2, \d3, \d1 vadd.s16 \d1, \d4, \d6 vsub.s16 \d3, \d7, \d5 .endm function sub4x4_dct_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.32 {d0[]}, [r1,:32], r3 vld1.32 {d1[]}, [r2,:32], ip vld1.32 {d2[]}, [r1,:32], r3 vsubl.u8 q8, d0, d1 vld1.32 {d3[]}, [r2,:32], ip vld1.32 {d4[]}, [r1,:32], r3 vsubl.u8 q9, d2, d3 vld1.32 {d5[]}, [r2,:32], ip vld1.32 {d6[]}, [r1,:32], r3 vsubl.u8 q10, d4, d5 vld1.32 {d7[]}, [r2,:32], ip vsubl.u8 q11, d6, d7 DCT_1D d0, d1, d2, d3, d16, d18, d20, d22 TRANSPOSE4x4_16 d0, d1, d2, d3 DCT_1D d4, d5, d6, d7, d0, d1, d2, d3 vst1.64 {d4-d7}, [r0,:128] bx lr endfunc function sub8x4_dct_neon, export=0 vld1.64 {d0}, [r1,:64], r3 vld1.64 {d1}, [r2,:64], ip vsubl.u8 q8, d0, d1 vld1.64 {d2}, [r1,:64], r3 vld1.64 {d3}, [r2,:64], ip vsubl.u8 q9, d2, d3 vld1.64 {d4}, [r1,:64], r3 vld1.64 {d5}, [r2,:64], ip vsubl.u8 q10, d4, d5 vld1.64 {d6}, [r1,:64], r3 vld1.64 {d7}, [r2,:64], ip vsubl.u8 q11, d6, d7 DCT_1D q0, q1, q2, q3, q8, q9, q10, q11 TRANSPOSE4x4_16 q0, q1, q2, q3 SUMSUB_AB q8, q12, q0, q3 SUMSUB_AB q9, q10, q1, q2 vadd.i16 q13, q12, q12 vadd.i16 q11, q10, q10 vadd.i16 d0, d16, d18 vadd.i16 d1, d26, d20 vsub.i16 d2, d16, d18 vsub.i16 d3, d24, d22 vst1.64 {d0-d1}, [r0,:128]! vadd.i16 d4, d17, d19 vadd.i16 d5, d27, d21 vst1.64 {d2-d3}, [r0,:128]! vsub.i16 d6, d17, d19 vsub.i16 d7, d25, d23 vst1.64 {d4-d5}, [r0,:128]! vst1.64 {d6-d7}, [r0,:128]! 
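// The 8x4 strip is fully transformed at this point: the left 4x4 block's
// 16 coefficients were stored first, followed by the right block's.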
bx lr endfunc function sub8x8_dct_neon push {lr} mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE bl sub8x4_dct_neon pop {lr} b sub8x4_dct_neon endfunc function sub16x16_dct_neon push {lr} mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE bl sub8x4_dct_neon bl sub8x4_dct_neon sub r1, r1, #8*FENC_STRIDE-8 sub r2, r2, #8*FDEC_STRIDE-8 bl sub8x4_dct_neon bl sub8x4_dct_neon sub r1, r1, #8 sub r2, r2, #8 bl sub8x4_dct_neon bl sub8x4_dct_neon sub r1, r1, #8*FENC_STRIDE-8 sub r2, r2, #8*FDEC_STRIDE-8 bl sub8x4_dct_neon pop {lr} b sub8x4_dct_neon endfunc .macro DCT8_1D type SUMSUB_AB q2, q1, q11, q12 // s34/d34 SUMSUB_AB q3, q11, q10, q13 // s25/d25 SUMSUB_AB q13, q10, q9, q14 // s16/d16 SUMSUB_AB q14, q8, q8, q15 // s07/d07 SUMSUB_AB q9, q2, q14, q2 // a0/a2 SUMSUB_AB q12, q14, q13, q3 // a1/a3 SUMSUB_AB q3, q13, q8, q1 // a6/a5 vshr.s16 q0, q10, #1 vshr.s16 q15, q11, #1 vadd.s16 q0, q0, q10 vadd.s16 q15, q15, q11 vsub.s16 q3, q3, q0 vsub.s16 q13, q13, q15 SUMSUB_AB q0, q15, q10, q11 // a4/a7 vshr.s16 q10, q8, #1 vshr.s16 q11, q1, #1 vadd.s16 q10, q10, q8 vadd.s16 q11, q11, q1 vadd.s16 q10, q0, q10 vadd.s16 q15, q15, q11 SUMSUB_AB q8, q12, q9, q12 SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1 SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1 SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1 .endm function sub8x8_dct8_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1,:64], r3 vld1.64 {d17}, [r2,:64], ip vsubl.u8 q8, d16, d17 vld1.64 {d18}, [r1,:64], r3 vld1.64 {d19}, [r2,:64], ip vsubl.u8 q9, d18, d19 vld1.64 {d20}, [r1,:64], r3 vld1.64 {d21}, [r2,:64], ip vsubl.u8 q10, d20, d21 vld1.64 {d22}, [r1,:64], r3 vld1.64 {d23}, [r2,:64], ip vsubl.u8 q11, d22, d23 vld1.64 {d24}, [r1,:64], r3 vld1.64 {d25}, [r2,:64], ip vsubl.u8 q12, d24, d25 vld1.64 {d26}, [r1,:64], r3 vld1.64 {d27}, [r2,:64], ip vsubl.u8 q13, d26, d27 vld1.64 {d28}, [r1,:64], r3 vld1.64 {d29}, [r2,:64], ip vsubl.u8 q14, d28, d29 vld1.64 {d30}, [r1,:64], r3 vld1.64 {d31}, [r2,:64], ip vsubl.u8 q15, d30, d31 DCT8_1D row vswp d17, d24 // 8, 12 vswp d21, d28 // 10,14 vtrn.32 q8, q10 vtrn.32 q12, q14 vswp d19, d26 // 9, 13 vswp d23, d30 // 11,15 vtrn.32 q9, q11 vtrn.32 q13, q15 vtrn.16 q10, q11 vtrn.16 q12, q13 vtrn.16 q8, q9 vtrn.16 q14, q15 DCT8_1D col vst1.64 {d16-d19}, [r0,:128]! vst1.64 {d20-d23}, [r0,:128]! vst1.64 {d24-d27}, [r0,:128]! vst1.64 {d28-d31}, [r0,:128]! 
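// Row and column passes are both done; all 64 coefficients of the 8x8
// transform have been written to the dct buffer.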
bx lr endfunc function sub16x16_dct8_neon push {lr} bl X(sub8x8_dct8_neon) sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 bl X(sub8x8_dct8_neon) sub r1, r1, #8 sub r2, r2, #8 bl X(sub8x8_dct8_neon) pop {lr} sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 b X(sub8x8_dct8_neon) endfunc // First part of IDCT (minus final SUMSUB_BA) .macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3 SUMSUB_AB \d4, \d5, \d0, \d2 vshr.s16 \d7, \d1, #1 vshr.s16 \d6, \d3, #1 vsub.s16 \d7, \d7, \d3 vadd.s16 \d6, \d6, \d1 .endm function add4x4_idct_neon mov r2, #FDEC_STRIDE vld1.64 {d0-d3}, [r1,:128] IDCT_1D d4, d5, d6, d7, d0, d1, d2, d3 vld1.32 {d30[0]}, [r0,:32], r2 SUMSUB_AB q0, q1, q2, q3 TRANSPOSE4x4_16 d0, d1, d3, d2 IDCT_1D d4, d5, d6, d7, d0, d1, d3, d2 vld1.32 {d30[1]}, [r0,:32], r2 SUMSUB_AB q0, q1, q2, q3 vrshr.s16 q0, q0, #6 vld1.32 {d31[1]}, [r0,:32], r2 vrshr.s16 q1, q1, #6 vld1.32 {d31[0]}, [r0,:32], r2 sub r0, r0, r2, lsl #2 vaddw.u8 q0, q0, d30 vaddw.u8 q1, q1, d31 vqmovun.s16 d0, q0 vqmovun.s16 d2, q1 vst1.32 {d0[0]}, [r0,:32], r2 vst1.32 {d0[1]}, [r0,:32], r2 vst1.32 {d2[1]}, [r0,:32], r2 vst1.32 {d2[0]}, [r0,:32], r2 bx lr endfunc function add8x4_idct_neon, export=0 vld1.64 {d0-d3}, [r1,:128]! IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3 vld1.64 {d4-d7}, [r1,:128]! IDCT_1D d17, d19, d21, d23, d4, d5, d6, d7 SUMSUB_AB q0, q3, q8, q10 SUMSUB_AB q1, q2, q9, q11 TRANSPOSE4x4_16 q0, q1, q2, q3 IDCT_1D q8, q9, q10, q11, q0, q1, q2, q3 SUMSUB_AB q0, q3, q8, q10 SUMSUB_AB q1, q2, q9, q11 vrshr.s16 q0, q0, #6 vld1.32 {d28}, [r0,:64], r2 vrshr.s16 q1, q1, #6 vld1.32 {d29}, [r0,:64], r2 vrshr.s16 q2, q2, #6 vld1.32 {d30}, [r0,:64], r2 vrshr.s16 q3, q3, #6 vld1.32 {d31}, [r0,:64], r2 sub r0, r0, r2, lsl #2 vaddw.u8 q0, q0, d28 vaddw.u8 q1, q1, d29 vaddw.u8 q2, q2, d30 vaddw.u8 q3, q3, d31 vqmovun.s16 d0, q0 vqmovun.s16 d1, q1 vst1.32 {d0}, [r0,:64], r2 vqmovun.s16 d2, q2 vst1.32 {d1}, [r0,:64], r2 vqmovun.s16 d3, q3 vst1.32 {d2}, [r0,:64], r2 vst1.32 {d3}, [r0,:64], r2 bx lr endfunc function add8x8_idct_neon mov r2, #FDEC_STRIDE mov ip, lr bl add8x4_idct_neon mov lr, ip b add8x4_idct_neon endfunc function add16x16_idct_neon mov r2, #FDEC_STRIDE mov ip, lr bl add8x4_idct_neon bl add8x4_idct_neon sub r0, r0, #8*FDEC_STRIDE-8 bl add8x4_idct_neon bl add8x4_idct_neon sub r0, r0, #8 bl add8x4_idct_neon bl add8x4_idct_neon sub r0, r0, #8*FDEC_STRIDE-8 bl add8x4_idct_neon mov lr, ip b add8x4_idct_neon endfunc .macro IDCT8_1D type .ifc \type, col vswp d21, d28 .endif SUMSUB_AB q0, q1, q8, q12 // a0/a2 .ifc \type, row vld1.64 {d28-d31}, [r1,:128]! .else vswp d19, d26 .endif SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4 .ifc \type, col vswp d23, d30 .endif SUMSUB_AB q8, q10, q13, q11 SUMSUB_15 q8, q10, q9, q15, q12, q14 // a7/a1 SUMSUB_AB q14, q15, q15, q9 SUMSUB_15 q15, q14, q13, q11, q12, q9 // a5/a3 SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5 SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7 SUMSUB_AB q10, q2, q0, q2 // b0/b6 SUMSUB_AB q11, q3, q1, q3 // b2/b4 SUMSUB_AB q8, q15, q10, q15 SUMSUB_AB q9, q14, q11, q14 SUMSUB_AB q10, q13, q3, q13 .ifc \type, row vtrn.16 q8, q9 .endif SUMSUB_AB q11, q12, q2, q12 .endm function add8x8_idct8_neon mov r2, #FDEC_STRIDE vld1.64 {d16-d19}, [r1,:128]! vld1.64 {d20-d23}, [r1,:128]! vld1.64 {d24-d27}, [r1,:128]! 
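// Six coefficient rows are loaded into q8-q13 here; the row pass below
// fetches the remaining two rows itself (see the "row" branch of IDCT8_1D)
// before the results are transposed for the column pass.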
IDCT8_1D row vtrn.16 q10, q11 vtrn.16 q12, q13 vtrn.16 q14, q15 vtrn.32 q8, q10 vtrn.32 q9, q11 vtrn.32 q12, q14 vtrn.32 q13, q15 vswp d17, d24 IDCT8_1D col vld1.64 {d0}, [r0,:64], r2 vrshr.s16 q8, q8, #6 vld1.64 {d1}, [r0,:64], r2 vrshr.s16 q9, q9, #6 vld1.64 {d2}, [r0,:64], r2 vrshr.s16 q10, q10, #6 vld1.64 {d3}, [r0,:64], r2 vrshr.s16 q11, q11, #6 vld1.64 {d4}, [r0,:64], r2 vrshr.s16 q12, q12, #6 vld1.64 {d5}, [r0,:64], r2 vrshr.s16 q13, q13, #6 vld1.64 {d6}, [r0,:64], r2 vrshr.s16 q14, q14, #6 vld1.64 {d7}, [r0,:64], r2 vrshr.s16 q15, q15, #6 sub r0, r0, r2, lsl #3 vaddw.u8 q8, q8, d0 vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 vqmovun.s16 d0, q8 vqmovun.s16 d1, q9 vqmovun.s16 d2, q10 vaddw.u8 q11, q11, d3 vst1.64 {d0}, [r0,:64], r2 vaddw.u8 q12, q12, d4 vst1.64 {d1}, [r0,:64], r2 vaddw.u8 q13, q13, d5 vst1.64 {d2}, [r0,:64], r2 vqmovun.s16 d3, q11 vqmovun.s16 d4, q12 vaddw.u8 q14, q14, d6 vaddw.u8 q15, q15, d7 vst1.64 {d3}, [r0,:64], r2 vqmovun.s16 d5, q13 vst1.64 {d4}, [r0,:64], r2 vqmovun.s16 d6, q14 vqmovun.s16 d7, q15 vst1.64 {d5}, [r0,:64], r2 vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr endfunc function add16x16_idct8_neon mov ip, lr bl X(add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 bl X(add8x8_idct8_neon) sub r0, r0, #8 bl X(add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 mov lr, ip b X(add8x8_idct8_neon) endfunc function add8x8_idct_dc_neon mov r2, #FDEC_STRIDE vld1.64 {d16}, [r1,:64] vrshr.s16 d16, d16, #6 vld1.64 {d0}, [r0,:64], r2 vmov.i16 q15, #0 vld1.64 {d1}, [r0,:64], r2 vld1.64 {d2}, [r0,:64], r2 vdup.16 d20, d16[0] vld1.64 {d3}, [r0,:64], r2 vdup.16 d21, d16[1] vld1.64 {d4}, [r0,:64], r2 vdup.16 d22, d16[2] vld1.64 {d5}, [r0,:64], r2 vdup.16 d23, d16[3] vld1.64 {d6}, [r0,:64], r2 vsub.s16 q12, q15, q10 vld1.64 {d7}, [r0,:64], r2 vsub.s16 q13, q15, q11 sub r0, r0, #8*FDEC_STRIDE vqmovun.s16 d20, q10 vqmovun.s16 d22, q11 vqmovun.s16 d24, q12 vqmovun.s16 d26, q13 vmov d21, d20 vqadd.u8 q0, q0, q10 vmov d23, d22 vqadd.u8 q1, q1, q10 vmov d25, d24 vqadd.u8 q2, q2, q11 vmov d27, d26 vqadd.u8 q3, q3, q11 vqsub.u8 q0, q0, q12 vqsub.u8 q1, q1, q12 vqsub.u8 q2, q2, q13 vst1.64 {d0}, [r0,:64], r2 vqsub.u8 q3, q3, q13 vst1.64 {d1}, [r0,:64], r2 vst1.64 {d2}, [r0,:64], r2 vst1.64 {d3}, [r0,:64], r2 vst1.64 {d4}, [r0,:64], r2 vst1.64 {d5}, [r0,:64], r2 vst1.64 {d6}, [r0,:64], r2 vst1.64 {d7}, [r0,:64], r2 bx lr endfunc .macro ADD16x4_IDCT_DC dc vld1.64 {d16-d17}, [r0,:128], r3 vld1.64 {d18-d19}, [r0,:128], r3 vdup.16 d4, \dc[0] vdup.16 d5, \dc[1] vld1.64 {d20-d21}, [r0,:128], r3 vdup.16 d6, \dc[2] vdup.16 d7, \dc[3] vld1.64 {d22-d23}, [r0,:128], r3 vsub.s16 q12, q15, q2 vsub.s16 q13, q15, q3 vqmovun.s16 d4, q2 vqmovun.s16 d5, q3 vqmovun.s16 d6, q12 vqmovun.s16 d7, q13 vqadd.u8 q8, q8, q2 vqadd.u8 q9, q9, q2 vqadd.u8 q10, q10, q2 vqadd.u8 q11, q11, q2 vqsub.u8 q8, q8, q3 vqsub.u8 q9, q9, q3 vqsub.u8 q10, q10, q3 vst1.64 {d16-d17}, [r2,:128], r3 vqsub.u8 q11, q11, q3 vst1.64 {d18-d19}, [r2,:128], r3 vst1.64 {d20-d21}, [r2,:128], r3 vst1.64 {d22-d23}, [r2,:128], r3 .endm function add16x16_idct_dc_neon mov r2, r0 mov r3, #FDEC_STRIDE vmov.i16 q15, #0 vld1.64 {d0-d3}, [r1,:64] vrshr.s16 q0, #6 vrshr.s16 q1, #6 ADD16x4_IDCT_DC d0 ADD16x4_IDCT_DC d1 ADD16x4_IDCT_DC d2 ADD16x4_IDCT_DC d3 bx lr endfunc function sub8x8_dct_dc_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1,:64], r3 vld1.64 {d17}, [r2,:64], ip vsubl.u8 q8, d16, d17 vld1.64 {d18}, [r1,:64], r3 vld1.64 {d19}, [r2,:64], ip vsubl.u8 q9, d18, d19 vld1.64 {d20}, [r1,:64], r3 vld1.64 {d21}, [r2,:64], 
ip vsubl.u8 q10, d20, d21 vld1.64 {d22}, [r1,:64], r3 vadd.s16 q0, q8, q9 vld1.64 {d23}, [r2,:64], ip vsubl.u8 q11, d22, d23 vld1.64 {d24}, [r1,:64], r3 vadd.s16 q0, q0, q10 vld1.64 {d25}, [r2,:64], ip vsubl.u8 q12, d24, d25 vld1.64 {d26}, [r1,:64], r3 vadd.s16 q0, q0, q11 vld1.64 {d27}, [r2,:64], ip vsubl.u8 q13, d26, d27 vld1.64 {d28}, [r1,:64], r3 vld1.64 {d29}, [r2,:64], ip vsubl.u8 q14, d28, d29 vld1.64 {d30}, [r1,:64], r3 vadd.s16 q1, q12, q13 vld1.64 {d31}, [r2,:64], ip vsubl.u8 q15, d30, d31 vadd.s16 q1, q1, q14 vadd.s16 d4, d0, d1 vadd.s16 q1, q1, q15 vsub.s16 d5, d0, d1 vadd.s16 d6, d2, d3 vsub.s16 d7, d2, d3 vadd.s16 q0, q2, q3 vsub.s16 q1, q2, q3 vpadd.s16 d0, d0, d2 vpadd.s16 d1, d1, d3 vpadd.s16 d0, d0, d1 vst1.64 {d0}, [r0,:64] bx lr endfunc function sub8x16_dct_dc_neon mov r3, #FENC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1,:64], r3 vld1.64 {d17}, [r2,:64], ip vsubl.u8 q8, d16, d17 vld1.64 {d18}, [r1,:64], r3 vld1.64 {d19}, [r2,:64], ip vsubl.u8 q9, d18, d19 vld1.64 {d20}, [r1,:64], r3 vld1.64 {d21}, [r2,:64], ip vsubl.u8 q10, d20, d21 vld1.64 {d22}, [r1,:64], r3 vadd.s16 q0, q8, q9 vld1.64 {d23}, [r2,:64], ip vsubl.u8 q11, d22, d23 vld1.64 {d24}, [r1,:64], r3 vadd.s16 q0, q0, q10 vld1.64 {d25}, [r2,:64], ip vsubl.u8 q12, d24, d25 vld1.64 {d26}, [r1,:64], r3 vadd.s16 q0, q0, q11 vld1.64 {d27}, [r2,:64], ip vsubl.u8 q13, d26, d27 vld1.64 {d28}, [r1,:64], r3 vld1.64 {d29}, [r2,:64], ip vsubl.u8 q14, d28, d29 vld1.64 {d30}, [r1,:64], r3 vadd.s16 q1, q12, q13 vld1.64 {d31}, [r2,:64], ip vsubl.u8 q15, d30, d31 vld1.64 {d16}, [r1,:64], r3 vadd.s16 q1, q1, q14 vld1.64 {d17}, [r2,:64], ip vadd.s16 q1, q1, q15 vld1.64 {d18}, [r1,:64], r3 vsubl.u8 q8, d16, d17 vld1.64 {d19}, [r2,:64], ip vsubl.u8 q9, d18, d19 vld1.64 {d20}, [r1,:64], r3 vld1.64 {d21}, [r2,:64], ip vsubl.u8 q10, d20, d21 vld1.64 {d22}, [r1,:64], r3 vadd.s16 q2, q8, q9 vld1.64 {d23}, [r2,:64], ip vsubl.u8 q11, d22, d23 vld1.64 {d24}, [r1,:64], r3 vadd.s16 q2, q2, q10 vld1.64 {d25}, [r2,:64], ip vsubl.u8 q12, d24, d25 vld1.64 {d26}, [r1,:64], r3 vadd.s16 q2, q2, q11 vld1.64 {d27}, [r2,:64], ip vsubl.u8 q13, d26, d27 vld1.64 {d28}, [r1,:64], r3 vld1.64 {d29}, [r2,:64], ip vsubl.u8 q14, d28, d29 vld1.64 {d30}, [r1,:64], r3 vadd.s16 q3, q12, q13 vld1.64 {d31}, [r2,:64], ip vsubl.u8 q15, d30, d31 vadd.s16 q3, q3, q14 vadd.s16 d16, d0, d1 @ b0 vadd.s16 q3, q3, q15 vsub.s16 d17, d0, d1 @ b4 vadd.s16 d18, d2, d3 @ b1 vsub.s16 d19, d2, d3 @ b5 vadd.s16 d20, d4, d5 @ b2 vsub.s16 d21, d4, d5 @ b6 vadd.s16 d22, d6, d7 @ b3 vsub.s16 d23, d6, d7 @ b7 vadd.s16 q0, q8, q9 @ b0 + b1, b4 + b5; a0, a2 vsub.s16 q1, q8, q9 @ b0 - b1, b4 - b5; a4, a6 vadd.s16 q2, q10, q11 @ b2 + b3, b6 + b7; a1, a3 vsub.s16 q3, q10, q11 @ b2 - b3, b6 - b7; a5, a7 vadd.s16 q8, q0, q2 @ a0 + a1, a2 + a3 vsub.s16 q9, q0, q2 @ a0 - a1, a2 - a3 vsub.s16 q10, q1, q3 @ a4 - a5, a6 - a7 vadd.s16 q11, q1, q3 @ a4 + a5, a6 + a7 vpadd.s16 d0, d16, d17 vpadd.s16 d1, d18, d19 vpadd.s16 d2, d20, d21 vpadd.s16 d3, d22, d23 vpadd.s16 d0, d0, d1 vpadd.s16 d1, d2, d3 vst1.64 {q0}, [r0,:64] bx lr endfunc function zigzag_scan_4x4_frame_neon movrel r2, scan4x4_frame vld1.64 {d0-d3}, [r1,:128] vld1.64 {d16-d19}, [r2,:128] vtbl.8 d4, {d0-d1}, d16 vtbl.8 d5, {d1-d3}, d17 vtbl.8 d6, {d0-d2}, d18 vtbl.8 d7, {d2-d3}, d19 vst1.64 {d4-d7}, [r0,:128] bx lr endfunc x264-master/common/arm/dct.h000066400000000000000000000067471502133446700160670ustar00rootroot00000000000000/***************************************************************************** * dct.h: arm transform and zigzag 
***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ARM_DCT_H #define X264_ARM_DCT_H #define x264_dct4x4dc_neon x264_template(dct4x4dc_neon) void x264_dct4x4dc_neon( int16_t d[16] ); #define x264_idct4x4dc_neon x264_template(idct4x4dc_neon) void x264_idct4x4dc_neon( int16_t d[16] ); #define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon) void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon) void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon) void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_add4x4_idct_neon x264_template(add4x4_idct_neon) void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] ); #define x264_add8x8_idct_neon x264_template(add8x8_idct_neon) void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] ); #define x264_add16x16_idct_neon x264_template(add16x16_idct_neon) void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] ); #define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon) void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); #define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon) void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); #define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon) void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon) void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon) void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon) void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); #define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon) void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] ); #define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon) void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); #define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon) void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); #endif 
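The prototypes above are thin wrappers: each NEON routine performs the same operation as x264's scalar C reference, just on whole rows at a time. As a reading aid, here is a plain-C sketch of the 4x4 case behind x264_sub4x4_dct_neon: subtract the reconstructed block (FDEC) from the source block (FENC), then apply the H.264 core transform [1 1 1 1; 2 1 -1 -2; 1 -1 -1 1; 1 -2 2 -1] to rows and columns. This is a sketch written from the transform definition, not code copied from the tree; the function name, the local stride defines and the exact output layout are illustrative.

#include <stdint.h>

#define FENC_STRIDE 16   /* matches the constants used throughout x264 */
#define FDEC_STRIDE 32

static void sub4x4_dct_ref( int16_t dct[16], const uint8_t *pix1, const uint8_t *pix2 )
{
    int16_t d[16], tmp[16];

    /* residual = fenc - fdec */
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            d[y*4+x] = pix1[y*FENC_STRIDE+x] - pix2[y*FDEC_STRIDE+x];

    /* first 1-D core transform pass, transposed into tmp */
    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3], s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3], d12 = d[i*4+1] - d[i*4+2];
        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }
    /* second 1-D pass on the transposed data */
    for( int i = 0; i < 4; i++ )
    {
        int s03 = tmp[i*4+0] + tmp[i*4+3], s12 = tmp[i*4+1] + tmp[i*4+2];
        int d03 = tmp[i*4+0] - tmp[i*4+3], d12 = tmp[i*4+1] - tmp[i*4+2];
        dct[i*4+0] =   s03 +   s12;
        dct[i*4+1] = 2*d03 +   d12;
        dct[i*4+2] =   s03 -   s12;
        dct[i*4+3] =   d03 - 2*d12;
    }
}

The NEON code above (for example sub8x16_dct_dc_neon) does the subtraction the same way, widening fenc-fdec pairs with vsubl.u8 before running the add/sub butterflies on whole D registers.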
x264-master/common/arm/deblock-a.S000066400000000000000000000610111502133446700170720ustar00rootroot00000000000000/***************************************************************************** * deblock.S: arm deblocking ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Mans Rullgard * Martin Storsjo * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" .macro h264_loop_filter_start ldr ip, [sp] ldr ip, [ip] vdup.32 d24, ip and ip, ip, ip, lsl #16 ands ip, ip, ip, lsl #8 bxlt lr .endm .macro align_push_regs and ip, sp, #15 add ip, ip, #32 sub sp, sp, ip vst1.64 {d12-d15}, [sp,:128] sub sp, sp, #32 vst1.64 {d8-d11}, [sp,:128] .endm .macro align_pop_regs vld1.64 {d8-d11}, [sp,:128]! vld1.64 {d12-d15}, [sp,:128], ip .endm .macro h264_loop_filter_luma vdup.8 q11, r2 @ alpha vmovl.u8 q12, d24 vabd.u8 q6, q8, q0 @ abs(p0 - q0) vmovl.u16 q12, d24 vabd.u8 q14, q9, q8 @ abs(p1 - p0) vsli.16 q12, q12, #8 vabd.u8 q15, q1, q0 @ abs(q1 - q0) vsli.32 q12, q12, #16 vclt.u8 q6, q6, q11 @ < alpha vdup.8 q11, r3 @ beta vclt.s8 q7, q12, #0 vclt.u8 q14, q14, q11 @ < beta vclt.u8 q15, q15, q11 @ < beta vbic q6, q6, q7 vabd.u8 q4, q10, q8 @ abs(p2 - p0) vand q6, q6, q14 vabd.u8 q5, q2, q0 @ abs(q2 - q0) vclt.u8 q4, q4, q11 @ < beta vand q6, q6, q15 vclt.u8 q5, q5, q11 @ < beta vand q4, q4, q6 vand q5, q5, q6 vand q12, q12, q6 vrhadd.u8 q14, q8, q0 vsub.i8 q6, q12, q4 vqadd.u8 q7, q9, q12 vhadd.u8 q10, q10, q14 vsub.i8 q6, q6, q5 vhadd.u8 q14, q2, q14 vmin.u8 q7, q7, q10 vqsub.u8 q11, q9, q12 vqadd.u8 q2, q1, q12 vmax.u8 q7, q7, q11 vqsub.u8 q11, q1, q12 vmin.u8 q14, q2, q14 vmovl.u8 q2, d0 vmax.u8 q14, q14, q11 vmovl.u8 q10, d1 vsubw.u8 q2, q2, d16 vsubw.u8 q10, q10, d17 vshl.i16 q2, q2, #2 vshl.i16 q10, q10, #2 vaddw.u8 q2, q2, d18 vaddw.u8 q10, q10, d19 vsubw.u8 q2, q2, d2 vsubw.u8 q10, q10, d3 vrshrn.i16 d4, q2, #3 vrshrn.i16 d5, q10, #3 vbsl q4, q7, q9 vbsl q5, q14, q1 vneg.s8 q7, q6 vmovl.u8 q14, d16 vmin.s8 q2, q2, q6 vmovl.u8 q6, d17 vmax.s8 q2, q2, q7 vmovl.u8 q11, d0 vmovl.u8 q12, d1 vaddw.s8 q14, q14, d4 vaddw.s8 q6, q6, d5 vsubw.s8 q11, q11, d4 vsubw.s8 q12, q12, d5 vqmovun.s16 d16, q14 vqmovun.s16 d17, q6 vqmovun.s16 d0, q11 vqmovun.s16 d1, q12 .endm function deblock_v_luma_neon h264_loop_filter_start vld1.64 {d0, d1}, [r0,:128], r1 vld1.64 {d2, d3}, [r0,:128], r1 vld1.64 {d4, d5}, [r0,:128], r1 sub r0, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vld1.64 {d20,d21}, [r0,:128], r1 vld1.64 {d18,d19}, [r0,:128], r1 vld1.64 {d16,d17}, [r0,:128], r1 align_push_regs h264_loop_filter_luma sub r0, r0, r1, lsl #1 vst1.64 {d8, d9}, [r0,:128], r1 vst1.64 {d16,d17}, [r0,:128], r1 
vst1.64 {d0, d1}, [r0,:128], r1 vst1.64 {d10,d11}, [r0,:128] align_pop_regs bx lr endfunc function deblock_h_luma_neon h264_loop_filter_start sub r0, r0, #4 vld1.64 {d6}, [r0], r1 vld1.64 {d20}, [r0], r1 vld1.64 {d18}, [r0], r1 vld1.64 {d16}, [r0], r1 vld1.64 {d0}, [r0], r1 vld1.64 {d2}, [r0], r1 vld1.64 {d4}, [r0], r1 vld1.64 {d26}, [r0], r1 vld1.64 {d7}, [r0], r1 vld1.64 {d21}, [r0], r1 vld1.64 {d19}, [r0], r1 vld1.64 {d17}, [r0], r1 vld1.64 {d1}, [r0], r1 vld1.64 {d3}, [r0], r1 vld1.64 {d5}, [r0], r1 vld1.64 {d27}, [r0], r1 TRANSPOSE8x8 q3, q10, q9, q8, q0, q1, q2, q13 align_push_regs h264_loop_filter_luma TRANSPOSE4x4 q4, q8, q0, q5 sub r0, r0, r1, lsl #4 add r0, r0, #2 vst1.32 {d8[0]}, [r0], r1 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d10[0]}, [r0], r1 vst1.32 {d8[1]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 vst1.32 {d0[1]}, [r0], r1 vst1.32 {d10[1]}, [r0], r1 vst1.32 {d9[0]}, [r0], r1 vst1.32 {d17[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 vst1.32 {d11[0]}, [r0], r1 vst1.32 {d9[1]}, [r0], r1 vst1.32 {d17[1]}, [r0], r1 vst1.32 {d1[1]}, [r0], r1 vst1.32 {d11[1]}, [r0], r1 align_pop_regs bx lr endfunc .macro h264_loop_filter_luma_intra vdup.8 q14, r2 @ alpha vabd.u8 q4, q8, q0 @ abs(p0 - q0) vabd.u8 q5, q9, q8 @ abs(p1 - p0) vabd.u8 q6, q1, q0 @ abs(q1 - q0) vdup.8 q15, r3 @ beta vmov.u8 q13, #2 vclt.u8 q7, q4, q14 @ < alpha vshr.u8 q14, q14, #2 @ alpha >> 2 vclt.u8 q5, q5, q15 @ < beta vadd.u8 q14, q14, q13 @ (alpha >> 2) + 2 vand q7, q7, q5 vclt.u8 q6, q6, q15 @ < beta vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2 vand q12, q7, q6 @ if_1 vshrn.u16 d28, q12, #4 vmov r2, lr, d28 orrs r2, r2, lr beq 9f sub sp, sp, #32 vst1.8 {q12-q13}, [sp,:128] vshll.u8 q4, d18, #1 @ 2*p1 vshll.u8 q5, d19, #1 vaddw.u8 q4, q4, d16 @ 2*p1 + p0 vaddw.u8 q5, q5, d17 vaddw.u8 q4, q4, d2 @ 2*p1 + p0 + q1 vaddw.u8 q5, q5, d3 vrshrn.u16 d24, q4, #2 vrshrn.u16 d25, q5, #2 vaddl.u8 q6, d20, d16 @ p2 + p0 vaddl.u8 q7, d21, d17 vaddw.u8 q6, q6, d0 @ p2 + p0 + q0 vaddw.u8 q7, q7, d1 vadd.u16 q4, q4, q6 @ p2 + 2*p1 + 2*p0 + q0 + q1 vadd.u16 q5, q5, q7 vaddw.u8 q4, q4, d0 @ p2 + 2*p1 + 2*p0 + 2*q0 + q1 vaddw.u8 q5, q5, d1 vrshrn.u16 d26, q4, #3 @ p0'_2 vrshrn.u16 d27, q5, #3 vaddw.u8 q6, q6, d18 @ p2 + p1 + p0 + q0 vaddw.u8 q7, q7, d19 vrshrn.u16 d28, q6, #2 @ p1'_2 vrshrn.u16 d29, q7, #2 vaddl.u8 q4, d22, d20 @ p3 + p2 vaddl.u8 q5, d23, d21 vshl.u16 q4, q4, #1 @ 2*p3 + 2*p2 vshl.u16 q5, q5, #1 vadd.u16 q4, q4, q6 @ 2*p3 + 3*p2 + p1 + p0 + q0 vadd.u16 q5, q5, q7 vrshrn.u16 d30, q4, #3 @ p2'_2 vrshrn.u16 d31, q5, #3 vdup.8 q4, r3 @ beta vabd.u8 q5, q10, q8 @ abs(p2 - p0) vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2 vclt.u8 q5, q5, q4 @ < beta if_3 vand q7, q7, q5 @ if_2 && if_3 vmvn q4, q7 vand q7, q7, q6 @ if_1 && if_2 && if_3 vand q6, q4, q6 @ if_1 && !(if_2 && if_3) @ copy p0 to q15 so it can be clobbered vbit q10, q15, q7 vmov q15, q8 vbit q8, q12, q6 @ wait for q9 to clobber vshll.u8 q4, d2, #1 @ 2*q1 vshll.u8 q5, d3, #1 vbit q8, q12, q6 vaddw.u8 q4, q4, d0 @ 2*q1 + q0 vaddw.u8 q5, q5, d1 vbit q8, q13, q7 vaddw.u8 q4, q4, d18 @ 2*q1 + q0 + p1 vaddw.u8 q5, q5, d19 vbit q9, q14, q7 vrshrn.u16 d24, q4, #2 vrshrn.u16 d25, q5, #2 vaddl.u8 q6, d4, d0 @ q2 + q0 vaddl.u8 q7, d5, d1 vaddw.u8 q6, q6, d30 @ q2 + q0 + p0 vaddw.u8 q7, q7, d31 vadd.u16 q4, q4, q6 @ q2 + 2*q1 + 2*q0 + p0 + p1 vadd.u16 q5, q5, q7 vaddw.u8 q4, q4, d30 @ q2 + 2*q1 + 2*q0 + 2*p0 + p1 vaddw.u8 q5, q5, d31 vrshrn.u16 d26, q4, #3 @ q0'_2 vrshrn.u16 d27, q5, #3 vaddw.u8 q6, q6, d2 @ q2 + q1 + q0 + p0 vaddw.u8 q7, q7, d3 vrshrn.u16 d28, q6, #2 @ 
q1'_2 vrshrn.u16 d29, q7, #2 vaddl.u8 q4, d6, d4 @ q3 + q2 vaddl.u8 q5, d7, d5 vshl.u16 q4, q4, #1 @ 2*q3 + 2*q2 vshl.u16 q5, q5, #1 vadd.u16 q4, q4, q6 @ 2*q3 + 3*q2 + q1 + q0 + p0 vadd.u16 q5, q5, q7 vrshrn.u16 d30, q4, #3 @ q2'_2 vrshrn.u16 d31, q5, #3 vdup.8 q4, r3 @ beta vabd.u8 q5, q2, q0 @ abs(q2 - q0) vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2 vclt.u8 q5, q5, q4 @ < beta if_4 vand q7, q7, q5 @ if_2 && if_4 vmvn q4, q7 vand q7, q6, q7 @ if_1 && if_2 && if_4 vand q6, q6, q4 @ if_1 && !(if_2 && if_4) vbit q0, q12, q6 vbit q1, q14, q7 vbit q0, q13, q7 vbit q2, q15, q7 .endm function deblock_v_luma_intra_neon push {lr} vld1.64 {d0, d1}, [r0,:128], r1 vld1.64 {d2, d3}, [r0,:128], r1 vld1.64 {d4, d5}, [r0,:128], r1 vld1.64 {d6, d7}, [r0,:128], r1 sub r0, r0, r1, lsl #3 vld1.64 {d22,d23}, [r0,:128], r1 vld1.64 {d20,d21}, [r0,:128], r1 vld1.64 {d18,d19}, [r0,:128], r1 vld1.64 {d16,d17}, [r0,:128] align_push_regs h264_loop_filter_luma_intra sub r0, r0, r1, lsl #1 vst1.64 {d20,d21}, [r0,:128], r1 vst1.64 {d18,d19}, [r0,:128], r1 vst1.64 {d16,d17}, [r0,:128], r1 vst1.64 {d0, d1}, [r0,:128], r1 vst1.64 {d2, d3}, [r0,:128], r1 vst1.64 {d4, d5}, [r0,:128] 9: align_pop_regs pop {pc} endfunc function deblock_h_luma_intra_neon push {lr} sub r0, r0, #4 vld1.64 {d22}, [r0], r1 vld1.64 {d20}, [r0], r1 vld1.64 {d18}, [r0], r1 vld1.64 {d16}, [r0], r1 vld1.64 {d0}, [r0], r1 vld1.64 {d2}, [r0], r1 vld1.64 {d4}, [r0], r1 vld1.64 {d6}, [r0], r1 vld1.64 {d23}, [r0], r1 vld1.64 {d21}, [r0], r1 vld1.64 {d19}, [r0], r1 vld1.64 {d17}, [r0], r1 vld1.64 {d1}, [r0], r1 vld1.64 {d3}, [r0], r1 vld1.64 {d5}, [r0], r1 vld1.64 {d7}, [r0], r1 TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3 align_push_regs h264_loop_filter_luma_intra TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3 sub r0, r0, r1, lsl #4 vst1.64 {d22}, [r0], r1 vst1.64 {d20}, [r0], r1 vst1.64 {d18}, [r0], r1 vst1.64 {d16}, [r0], r1 vst1.64 {d0}, [r0], r1 vst1.64 {d2}, [r0], r1 vst1.64 {d4}, [r0], r1 vst1.64 {d6}, [r0], r1 vst1.64 {d23}, [r0], r1 vst1.64 {d21}, [r0], r1 vst1.64 {d19}, [r0], r1 vst1.64 {d17}, [r0], r1 vst1.64 {d1}, [r0], r1 vst1.64 {d3}, [r0], r1 vst1.64 {d5}, [r0], r1 vst1.64 {d7}, [r0], r1 9: align_pop_regs pop {pc} endfunc .macro h264_loop_filter_chroma vdup.8 q11, r2 // alpha vmovl.u8 q12, d24 vabd.u8 q13, q8, q0 // abs(p0 - q0) vabd.u8 q14, q9, q8 // abs(p1 - p0) vsubl.u8 q2, d0, d16 vsubl.u8 q3, d1, d17 vsli.16 q12, q12, #8 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 vabd.u8 q15, q1, q0 // abs(q1 - q0) vmovl.u8 q12, d24 vaddw.u8 q2, q2, d18 vaddw.u8 q3, q3, d19 vclt.u8 q13, q13, q11 // < alpha vsubw.u8 q2, q2, d2 vsubw.u8 q3, q3, d3 vsli.16 q12, q12, #8 vdup.8 q11, r3 // beta vclt.s8 q10, q12, #0 vrshrn.i16 d4, q2, #3 vrshrn.i16 d5, q3, #3 vclt.u8 q14, q14, q11 // < beta vbic q13, q13, q10 vclt.u8 q15, q15, q11 // < beta vand q13, q13, q14 vneg.s8 q10, q12 vand q13, q13, q15 vmin.s8 q2, q2, q12 vmovl.u8 q14, d16 vand q2, q2, q13 vmovl.u8 q15, d17 vmax.s8 q2, q2, q10 vmovl.u8 q11, d0 vmovl.u8 q12, d1 vaddw.s8 q14, q14, d4 vaddw.s8 q15, q15, d5 vsubw.s8 q11, q11, d4 vsubw.s8 q12, q12, d5 vqmovun.s16 d16, q14 vqmovun.s16 d17, q15 vqmovun.s16 d0, q11 vqmovun.s16 d1, q12 .endm function deblock_v_chroma_neon h264_loop_filter_start sub r0, r0, r1, lsl #1 vld1.8 {d18,d19}, [r0,:128], r1 vld1.8 {d16,d17}, [r0,:128], r1 vld1.8 {d0, d1}, [r0,:128], r1 vld1.8 {d2, d3}, [r0,:128] h264_loop_filter_chroma sub r0, r0, r1, lsl #1 vst1.8 {d16,d17}, [r0,:128], r1 vst1.8 {d0, d1}, [r0,:128], r1 bx lr endfunc function deblock_h_chroma_neon h264_loop_filter_start sub 
r0, r0, #4 deblock_h_chroma: vld1.8 {d18}, [r0], r1 vld1.8 {d16}, [r0], r1 vld1.8 {d0}, [r0], r1 vld1.8 {d2}, [r0], r1 vld1.8 {d19}, [r0], r1 vld1.8 {d17}, [r0], r1 vld1.8 {d1}, [r0], r1 vld1.8 {d3}, [r0], r1 TRANSPOSE4x4_16 q9, q8, q0, q1 h264_loop_filter_chroma vtrn.16 q8, q0 sub r0, r0, r1, lsl #3 add r0, r0, #2 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 vst1.32 {d0[1]}, [r0], r1 vst1.32 {d17[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 vst1.32 {d17[1]}, [r0], r1 vst1.32 {d1[1]}, [r0], r1 bx lr endfunc function deblock_h_chroma_422_neon h264_loop_filter_start push {lr} sub r0, r0, #4 add r1, r1, r1 bl deblock_h_chroma ldr ip, [sp, #4] ldr ip, [ip] vdup.32 d24, ip sub r0, r0, r1, lsl #3 add r0, r0, r1, lsr #1 sub r0, r0, #2 pop {lr} b deblock_h_chroma endfunc .macro h264_loop_filter_chroma8 vdup.8 d22, r2 @ alpha vmovl.u8 q12, d24 vabd.u8 d26, d16, d0 @ abs(p0 - q0) vabd.u8 d28, d18, d16 @ abs(p1 - p0) vsubl.u8 q2, d0, d16 vsli.16 d24, d24, #8 vshl.i16 q2, q2, #2 vabd.u8 d30, d2, d0 @ abs(q1 - q0) vaddw.u8 q2, q2, d18 vclt.u8 d26, d26, d22 @ < alpha vsubw.u8 q2, q2, d2 vdup.8 d22, r3 @ beta vclt.s8 d20, d24, #0 vrshrn.i16 d4, q2, #3 vclt.u8 d28, d28, d22 @ < beta vbic d26, d26, d20 vclt.u8 d30, d30, d22 @ < beta vand d26, d26, d28 vneg.s8 d20, d24 vand d26, d26, d30 vmin.s8 d4, d4, d24 vmovl.u8 q14, d16 vand d4, d4, d26 vmax.s8 d4, d4, d20 vmovl.u8 q11, d0 vaddw.s8 q14, q14, d4 vsubw.s8 q11, q11, d4 vqmovun.s16 d16, q14 vqmovun.s16 d0, q11 .endm function deblock_h_chroma_mbaff_neon h264_loop_filter_start sub r0, r0, #4 vld1.8 {d18}, [r0], r1 vld1.8 {d16}, [r0], r1 vld1.8 {d0}, [r0], r1 vld1.8 {d2}, [r0], r1 TRANSPOSE4x4_16 d18, d16, d0, d2 h264_loop_filter_chroma8 vtrn.16 d16, d0 sub r0, r0, r1, lsl #2 add r0, r0, #2 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 vst1.32 {d0[1]}, [r0] bx lr endfunc .macro h264_loop_filter_chroma_intra, width=16 vdup.8 q11, r2 @ alpha vabd.u8 q13, q8, q0 @ abs(p0 - q0) vabd.u8 q14, q9, q8 @ abs(p1 - p0) vabd.u8 q15, q1, q0 @ abs(q1 - q0) vclt.u8 q13, q13, q11 @ < alpha vdup.8 q11, r3 @ beta vclt.u8 q14, q14, q11 @ < beta vclt.u8 q15, q15, q11 @ < beta vand q13, q13, q14 vand q13, q13, q15 vshll.u8 q14, d18, #1 vshll.u8 q2, d2, #1 .ifc \width, 16 vshll.u8 q15, d19, #1 vshll.u8 q3, d3, #1 vaddl.u8 q12, d17, d3 vaddl.u8 q10, d1, d19 .endif vaddl.u8 q11, d16, d2 vaddl.u8 q1, d18, d0 @ or vaddw q2, to not clobber q1 vadd.u16 q14, q14, q11 vadd.u16 q2, q2, q1 .ifc \width, 16 vadd.u16 q15, q15, q12 vadd.u16 q3, q3, q10 .endif vqrshrn.u16 d28, q14, #2 vqrshrn.u16 d4, q2, #2 .ifc \width, 16 vqrshrn.u16 d29, q15, #2 vqrshrn.u16 d5, q3, #2 .endif vbit q8, q14, q13 vbit q0, q2, q13 .endm function deblock_v_chroma_intra_neon sub r0, r0, r1, lsl #1 vld2.8 {d18,d19}, [r0,:128], r1 vld2.8 {d16,d17}, [r0,:128], r1 vld2.8 {d0, d1}, [r0,:128], r1 vld2.8 {d2, d3}, [r0,:128] h264_loop_filter_chroma_intra sub r0, r0, r1, lsl #1 vst2.8 {d16,d17}, [r0,:128], r1 vst2.8 {d0, d1}, [r0,:128], r1 bx lr endfunc function deblock_h_chroma_intra_neon sub r0, r0, #4 vld1.8 {d18}, [r0], r1 vld1.8 {d16}, [r0], r1 vld1.8 {d0}, [r0], r1 vld1.8 {d2}, [r0], r1 vld1.8 {d19}, [r0], r1 vld1.8 {d17}, [r0], r1 vld1.8 {d1}, [r0], r1 vld1.8 {d3}, [r0], r1 TRANSPOSE4x4_16 q9, q8, q0, q1 h264_loop_filter_chroma_intra vtrn.16 q8, q0 sub r0, r0, r1, lsl #3 add r0, r0, #2 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 vst1.32 {d0[1]}, [r0], r1 vst1.32 {d17[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 vst1.32 
{d17[1]}, [r0], r1 vst1.32 {d1[1]}, [r0], r1 bx lr endfunc function deblock_h_chroma_422_intra_neon push {lr} bl X(deblock_h_chroma_intra_neon) add r0, r0, #2 pop {lr} b X(deblock_h_chroma_intra_neon) endfunc function deblock_h_chroma_intra_mbaff_neon sub r0, r0, #4 vld1.8 {d18}, [r0], r1 vld1.8 {d16}, [r0], r1 vld1.8 {d0}, [r0], r1 vld1.8 {d2}, [r0], r1 TRANSPOSE4x4_16 d18, d16, d0, d2 h264_loop_filter_chroma_intra width=8 vtrn.16 d16, d0 sub r0, r0, r1, lsl #2 add r0, r0, #2 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 vst1.32 {d0[1]}, [r0] bx lr endfunc function deblock_strength_neon ldr ip, [sp] vmov.i8 q8, #0 lsl ip, ip, #8 add r3, r3, #32 sub ip, ip, #(1<<8)-3 vmov.i8 q9, #0 vdup.16 q10, ip ldr ip, [sp, #4] lists: @ load bytes ref vld1.8 {d31}, [r1]! add r2, r2, #16 vld1.8 {q1}, [r1]! vmov.i8 q0, #0 vld1.8 {q2}, [r1]! vext.8 q3, q0, q1, #15 vext.8 q0, q0, q2, #15 vuzp.32 q1, q2 vuzp.32 q3, q0 vext.8 q1, q15, q2, #12 veor q0, q0, q2 veor q1, q1, q2 vorr q8, q8, q0 vorr q9, q9, q1 vld1.16 {q11}, [r2,:128]! @ mv + 0x10 vld1.16 {q3}, [r2,:128]! @ mv + 0x20 vld1.16 {q12}, [r2,:128]! @ mv + 0x30 vld1.16 {q2}, [r2,:128]! @ mv + 0x40 vld1.16 {q13}, [r2,:128]! @ mv + 0x50 vext.8 q3, q3, q12, #12 vext.8 q2, q2, q13, #12 vabd.s16 q0, q12, q3 vld1.16 {q3}, [r2,:128]! @ mv + 0x60 vabd.s16 q1, q13, q2 vld1.16 {q14}, [r2,:128]! @ mv + 0x70 vqmovn.u16 d0, q0 vld1.16 {q2}, [r2,:128]! @ mv + 0x80 vld1.16 {q15}, [r2,:128]! @ mv + 0x90 vqmovn.u16 d1, q1 vext.8 q3, q3, q14, #12 vext.8 q2, q2, q15, #12 vabd.s16 q3, q14, q3 vabd.s16 q2, q15, q2 vqmovn.u16 d2, q3 vqmovn.u16 d3, q2 vqsub.u8 q0, q0, q10 vqsub.u8 q1, q1, q10 vqmovn.u16 d0, q0 vqmovn.u16 d1, q1 vabd.s16 q1, q12, q13 vorr q8, q8, q0 vabd.s16 q0, q11, q12 vabd.s16 q2, q13, q14 vabd.s16 q3, q14, q15 vqmovn.u16 d0, q0 vqmovn.u16 d1, q1 vqmovn.u16 d2, q2 vqmovn.u16 d3, q3 vqsub.u8 q0, q0, q10 vqsub.u8 q1, q1, q10 vqmovn.u16 d0, q0 vqmovn.u16 d1, q1 subs ip, ip, #1 vorr q9, q9, q0 beq lists mov ip, #-32 @ load bytes nnz vld1.8 {d31}, [r0]! vld1.8 {q1}, [r0]! vmov.i8 q0, #0 vld1.8 {q2}, [r0] vext.8 q3, q0, q1, #15 vext.8 q0, q0, q2, #15 vuzp.32 q1, q2 vuzp.32 q3, q0 vext.8 q1, q15, q2, #12 vorr q0, q0, q2 vorr q1, q1, q2 vmov.u8 q10, #1 vmin.u8 q0, q0, q10 vmin.u8 q1, q1, q10 vmin.u8 q8, q8, q10 @ mv ? 1 : 0 vmin.u8 q9, q9, q10 vadd.u8 q0, q0, q0 @ nnz ? 2 : 0 vadd.u8 q1, q1, q1 vmax.u8 q8, q8, q0 vmax.u8 q9, q9, q1 vzip.16 d16, d17 vst1.8 {q9}, [r3,:128], ip @ bs[1] vtrn.8 d16, d17 vtrn.32 d16, d17 vst1.8 {q8}, [r3,:128] @ bs[0] bx lr endfunc x264-master/common/arm/deblock.h000066400000000000000000000072331502133446700167070ustar00rootroot00000000000000/***************************************************************************** * deblock.h: arm deblocking ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ARM_DEBLOCK_H #define X264_ARM_DEBLOCK_H #define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon) void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon) void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon) void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon) void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_strength_neon x264_template(deblock_strength_neon) void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon) void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon) void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon) void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon) void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon) void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon) void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon) void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon) void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); #endif x264-master/common/arm/mc-a.S000066400000000000000000001545571502133446700161100ustar00rootroot00000000000000/***************************************************************************** * mc.S: arm motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Mans Rullgard * Stefan Groenroos * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" const pw_0to15, align=4 .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 endconst .text // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 // They also use nothing above armv5te, but we don't care about pre-armv6 // void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) function prefetch_ref_arm sub r2, r2, #1 add r0, r0, #64 and r2, r2, r1 add r0, r0, r2, lsl #3 add r2, r1, r1, lsl #1 pld [r0] pld [r0, r1] pld [r0, r1, lsl #1] add r3, r0, r1, lsl #2 pld [r0, r2] pld [r3] pld [r3, r1] pld [r3, r1, lsl #1] pld [r3, r2] bx lr endfunc // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, // uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) function prefetch_fenc_arm ldr ip, [sp] push {lr} and lr, ip, #3 smulbb lr, lr, r1 // note: this assumes stride_y is <= 16 bits signed and ip, ip, #6 smulbb ip, ip, r3 add r0, r0, #64 add r2, r2, #64 add r0, r0, lr, lsl #2 pld [r0] add lr, r0, r1, lsl #1 pld [r0, r1] pld [lr] add r2, r2, ip, lsl #2 pld [lr, r1] pld [r2] add ip, r2, r3, lsl #1 pld [r2, r3] pld [ip] pld [ip, r3] pop {pc} endfunc // void *memcpy_aligned( void *dst, const void *src, size_t n ) function memcpy_aligned_neon orr r3, r0, r1, lsr #1 movrel ip, memcpy_table and r3, r3, #0xc ldr pc, [ip, r3] endfunc .macro MEMCPY_ALIGNED srcalign dstalign function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0 mov r3, r0 .if \srcalign == 8 && \dstalign == 8 sub r2, #16 vld1.64 {d0}, [r1,:64]! vst1.64 {d0}, [r3,:64]! .set r1align, 128 .set r3align, 128 .else .set r1align, \srcalign * 8 .set r3align, \dstalign * 8 .endif tst r2, #16 beq 32f sub r2, #16 vld1.64 {d0-d1}, [r1,:r1align]! vst1.64 {d0-d1}, [r3,:r3align]! 32: // n is a multiple of 32 tst r2, #32 beq 640f sub r2, #32 vld1.64 {d0-d3}, [r1,:r1align]! vst1.64 {d0-d3}, [r3,:r3align]! 640: // n is a multiple of 64 cmp r2, #0 beq 1f 64: subs r2, #64 vld1.64 {d0-d3}, [r1,:r1align]! vld1.64 {d4-d7}, [r1,:r1align]! vst1.64 {d0-d3}, [r3,:r3align]! vst1.64 {d4-d7}, [r3,:r3align]! bgt 64b 1: // end .if \srcalign == 8 && \dstalign == 8 vld1.64 {d0}, [r1,:64]! vst1.64 {d0}, [r3,:64]! .endif bx lr endfunc .endm MEMCPY_ALIGNED 16, 16 MEMCPY_ALIGNED 16, 8 MEMCPY_ALIGNED 8, 16 MEMCPY_ALIGNED 8, 8 const memcpy_table, align=2, relocate=1 .word memcpy_aligned_16_16_neon .word memcpy_aligned_16_8_neon .word memcpy_aligned_8_16_neon .word memcpy_aligned_8_8_neon endconst .text .ltorg // void memzero_aligned( void *dst, size_t n ) function memzero_aligned_neon vmov.i8 q0, #0 vmov.i8 q1, #0 memzero_loop: subs r1, #128 .rept 4 vst1.64 {d0-d3}, [r0,:128]! 
.endr bgt memzero_loop bx lr endfunc // void pixel_avg( uint8_t *dst, intptr_t dst_stride, // uint8_t *src1, intptr_t src1_stride, // uint8_t *src2, intptr_t src2_stride, int weight ); .macro AVGH w h function pixel_avg_\w\()x\h\()_neon ldr ip, [sp, #8] push {r4-r6,lr} cmp ip, #32 ldrd r4, r5, [sp, #16] mov lr, #\h beq pixel_avg_w\w\()_neon rsbs r6, ip, #64 blt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 cmp ip, #0 bge pixel_avg_weight_w\w\()_add_add_neon b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 endfunc .endm AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 AVGH 16, 8 AVGH 16, 16 // 0 < weight < 64 .macro load_weights_add_add vdup.8 d30, ip vdup.8 d31, r6 .endm .macro load_add_add d1 d2 vld1.32 {\d1}, [r2], r3 vld1.32 {\d2}, [r4], r5 .endm .macro weight_add_add dst s1 s2 vmull.u8 \dst, \s1, d30 vmlal.u8 \dst, \s2, d31 .endm // weight > 64 .macro load_weights_add_sub rsb r6, #0 vdup.8 d30, ip vdup.8 d31, r6 .endm .macro load_add_sub d1 d2 vld1.32 {\d1}, [r2], r3 vld1.32 {\d2}, [r4], r5 .endm .macro weight_add_sub dst s1 s2 vmull.u8 \dst, \s1, d30 vmlsl.u8 \dst, \s2, d31 .endm // weight < 0 .macro load_weights_sub_add rsb ip, #0 vdup.8 d31, r6 vdup.8 d30, ip .endm .macro load_sub_add d1 d2 vld1.32 {\d2}, [r4], r5 vld1.32 {\d1}, [r2], r3 .endm .macro weight_sub_add dst s1 s2 vmull.u8 \dst, \s2, d31 vmlsl.u8 \dst, \s1, d30 .endm .macro AVG_WEIGHT ext function pixel_avg_weight_w4_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 load_\ext d0[], d1[] weight_\ext q8, d0, d1 load_\ext d2[], d3[] vqrshrun.s16 d0, q8, #6 weight_\ext q9, d2, d3 vst1.32 {d0[0]}, [r0,:32], r1 vqrshrun.s16 d1, q9, #6 vst1.32 {d1[0]}, [r0,:32], r1 bgt 1b pop {r4-r6,pc} endfunc function pixel_avg_weight_w8_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #4 load_\ext d0, d1 weight_\ext q8, d0, d1 load_\ext d2, d3 weight_\ext q9, d2, d3 load_\ext d4, d5 weight_\ext q10, d4, d5 load_\ext d6, d7 weight_\ext q11, d6, d7 vqrshrun.s16 d0, q8, #6 vqrshrun.s16 d1, q9, #6 vqrshrun.s16 d2, q10, #6 vqrshrun.s16 d3, q11, #6 vst1.64 {d0}, [r0,:64], r1 vst1.64 {d1}, [r0,:64], r1 vst1.64 {d2}, [r0,:64], r1 vst1.64 {d3}, [r0,:64], r1 bgt 1b pop {r4-r6,pc} endfunc function pixel_avg_weight_w16_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 load_\ext d0-d1, d2-d3 weight_\ext q8, d0, d2 weight_\ext q9, d1, d3 load_\ext d4-d5, d6-d7 weight_\ext q10, d4, d6 weight_\ext q11, d5, d7 vqrshrun.s16 d0, q8, #6 vqrshrun.s16 d1, q9, #6 vqrshrun.s16 d2, q10, #6 vqrshrun.s16 d3, q11, #6 vst1.64 {d0-d1}, [r0,:128], r1 vst1.64 {d2-d3}, [r0,:128], r1 bgt 1b pop {r4-r6,pc} endfunc .endm AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add function pixel_avg_w4_neon, export=0 subs lr, lr, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [r4], r5 vrhadd.u8 d0, d0, d2 vld1.32 {d1[]}, [r2], r3 vld1.32 {d3[]}, [r4], r5 vrhadd.u8 d1, d1, d3 vst1.32 {d0[0]}, [r0,:32], r1 vst1.32 {d1[0]}, [r0,:32], r1 bgt pixel_avg_w4_neon pop {r4-r6,pc} endfunc function pixel_avg_w8_neon, export=0 subs lr, lr, #4 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [r4], r5 vrhadd.u8 d0, d0, d2 vld1.64 {d1}, [r2], r3 vld1.64 {d3}, [r4], r5 vrhadd.u8 d1, d1, d3 vst1.64 {d0}, [r0,:64], r1 vld1.64 {d2}, [r2], r3 vld1.64 {d4}, [r4], r5 vrhadd.u8 d2, d2, d4 vst1.64 {d1}, [r0,:64], r1 vld1.64 {d3}, [r2], r3 vld1.64 {d5}, [r4], r5 vrhadd.u8 d3, d3, d5 vst1.64 {d2}, [r0,:64], r1 vst1.64 {d3}, [r0,:64], r1 bgt pixel_avg_w8_neon pop {r4-r6,pc} endfunc function pixel_avg_w16_neon, export=0 
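    // 16-byte-wide average for the weight==32 fast path dispatched from AVGH:
    // vrhadd.u8 computes (src1 + src2 + 1) >> 1 per byte, and each iteration
    // loads, averages and stores four rows before looping on the height in lr.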
subs lr, lr, #4 vld1.64 {d0-d1}, [r2], r3 vld1.64 {d2-d3}, [r4], r5 vrhadd.u8 q0, q0, q1 vld1.64 {d2-d3}, [r2], r3 vld1.64 {d4-d5}, [r4], r5 vrhadd.u8 q1, q1, q2 vst1.64 {d0-d1}, [r0,:128], r1 vld1.64 {d4-d5}, [r2], r3 vld1.64 {d6-d7}, [r4], r5 vrhadd.u8 q2, q2, q3 vst1.64 {d2-d3}, [r0,:128], r1 vld1.64 {d6-d7}, [r2], r3 vld1.64 {d0-d1}, [r4], r5 vrhadd.u8 q3, q3, q0 vst1.64 {d4-d5}, [r0,:128], r1 vst1.64 {d6-d7}, [r0,:128], r1 bgt pixel_avg_w16_neon pop {r4-r6,pc} endfunc function pixel_avg2_w4_neon ldr ip, [sp, #4] push {lr} ldr lr, [sp, #4] avg2_w4_loop: subs ip, ip, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [lr], r3 vrhadd.u8 d0, d0, d2 vld1.32 {d1[]}, [r2], r3 vld1.32 {d3[]}, [lr], r3 vrhadd.u8 d1, d1, d3 vst1.32 {d0[0]}, [r0,:32], r1 vst1.32 {d1[0]}, [r0,:32], r1 bgt avg2_w4_loop pop {pc} endfunc function pixel_avg2_w8_neon ldr ip, [sp, #4] push {lr} ldr lr, [sp, #4] avg2_w8_loop: subs ip, ip, #2 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [lr], r3 vrhadd.u8 d0, d0, d2 vld1.64 {d1}, [r2], r3 vld1.64 {d3}, [lr], r3 vrhadd.u8 d1, d1, d3 vst1.64 {d0}, [r0,:64], r1 vst1.64 {d1}, [r0,:64], r1 bgt avg2_w8_loop pop {pc} endfunc function pixel_avg2_w16_neon ldr ip, [sp, #4] push {lr} ldr lr, [sp, #4] avg2_w16_loop: subs ip, ip, #2 vld1.64 {d0-d1}, [r2], r3 vld1.64 {d2-d3}, [lr], r3 vrhadd.u8 q0, q0, q1 vld1.64 {d4-d5}, [r2], r3 vld1.64 {d6-d7}, [lr], r3 vrhadd.u8 q2, q2, q3 vst1.64 {d0-d1}, [r0,:128], r1 vst1.64 {d4-d5}, [r0,:128], r1 bgt avg2_w16_loop pop {pc} endfunc function pixel_avg2_w20_neon ldr ip, [sp, #4] push {lr} sub r1, r1, #16 ldr lr, [sp, #4] avg2_w20_loop: subs ip, ip, #2 vld1.64 {d0-d2}, [r2], r3 vld1.64 {d4-d6}, [lr], r3 vrhadd.u8 q0, q0, q2 vrhadd.u8 d2, d2, d6 vld1.64 {d4-d6}, [r2], r3 vld1.64 {d16-d18},[lr], r3 vrhadd.u8 q2, q2, q8 vst1.64 {d0-d1}, [r0,:128]! vrhadd.u8 d6, d6, d18 vst1.32 {d2[0]}, [r0,:32], r1 vst1.64 {d4-d5}, [r0,:128]! vst1.32 {d6[0]}, [r0,:32], r1 bgt avg2_w20_loop pop {pc} endfunc .macro weight_prologue type push {r4-r5,lr} ldr r4, [sp, #4*3] // weight_t ldr ip, [sp, #4*3+4] // h .ifc \type, full ldr lr, [r4, #32] // denom .endif ldrd r4, r5, [r4, #32+4] // scale, offset vdup.8 d0, r4 vdup.16 q1, r5 .ifc \type, full rsb lr, lr, #0 vdup.16 q2, lr .endif .endm // void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride, // const x264_weight_t *weight, int height ) function mc_weight_w20_neon weight_prologue full sub r1, #16 weight20_loop: subs ip, #2 vld1.8 {d17-d19}, [r2], r3 vmull.u8 q10, d17, d0 vmull.u8 q11, d18, d0 vld1.8 {d16-d18}, [r2], r3 vmull.u8 q12, d16, d0 vmull.u8 q13, d17, d0 vtrn.32 d19, d18 vmull.u8 q14, d19, d0 vrshl.s16 q10, q10, q2 vrshl.s16 q11, q11, q2 vrshl.s16 q12, q12, q2 vrshl.s16 q13, q13, q2 vrshl.s16 q14, q14, q2 vadd.s16 q10, q10, q1 vadd.s16 q11, q11, q1 vadd.s16 q12, q12, q1 vadd.s16 q13, q13, q1 vadd.s16 q14, q14, q1 vqmovun.s16 d16, q10 vqmovun.s16 d17, q11 vqmovun.s16 d18, q12 vqmovun.s16 d19, q13 vqmovun.s16 d20, q14 vst1.8 {d16-d17}, [r0,:128]! vst1.32 {d20[0]}, [r0,:32], r1 vst1.8 {d18-d19}, [r0,:128]! 
vst1.32 {d20[1]}, [r0,:32], r1 bgt weight20_loop pop {r4-r5,pc} endfunc function mc_weight_w16_neon weight_prologue full weight16_loop: subs ip, #2 vld1.8 {d16-d17}, [r2], r3 vld1.8 {d18-d19}, [r2], r3 vmull.u8 q10, d16, d0 vmull.u8 q11, d17, d0 vmull.u8 q12, d18, d0 vmull.u8 q13, d19, d0 vrshl.s16 q10, q10, q2 vrshl.s16 q11, q11, q2 vrshl.s16 q12, q12, q2 vrshl.s16 q13, q13, q2 vadd.s16 q10, q10, q1 vadd.s16 q11, q11, q1 vadd.s16 q12, q12, q1 vadd.s16 q13, q13, q1 vqmovun.s16 d16, q10 vqmovun.s16 d17, q11 vqmovun.s16 d18, q12 vqmovun.s16 d19, q13 vst1.8 {d16-d17}, [r0,:128], r1 vst1.8 {d18-d19}, [r0,:128], r1 bgt weight16_loop pop {r4-r5,pc} endfunc function mc_weight_w8_neon weight_prologue full weight8_loop: subs ip, #2 vld1.8 {d16}, [r2], r3 vld1.8 {d18}, [r2], r3 vmull.u8 q8, d16, d0 vmull.u8 q9, d18, d0 vrshl.s16 q8, q8, q2 vrshl.s16 q9, q9, q2 vadd.s16 q8, q8, q1 vadd.s16 q9, q9, q1 vqmovun.s16 d16, q8 vqmovun.s16 d18, q9 vst1.8 {d16}, [r0,:64], r1 vst1.8 {d18}, [r0,:64], r1 bgt weight8_loop pop {r4-r5,pc} endfunc function mc_weight_w4_neon weight_prologue full weight4_loop: subs ip, #2 vld1.32 {d16[0]}, [r2], r3 vld1.32 {d16[1]}, [r2], r3 vmull.u8 q8, d16, d0 vrshl.s16 q8, q8, q2 vadd.s16 q8, q8, q1 vqmovun.s16 d16, q8 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 bgt weight4_loop pop {r4-r5,pc} endfunc function mc_weight_w20_nodenom_neon weight_prologue nodenom sub r1, #16 weight20_nodenom_loop: subs ip, #2 vld1.8 {d26-d28}, [r2], r3 vmov q8, q1 vmov q9, q1 vld1.8 {d29-d31}, [r2], r3 vmov q10, q1 vmov q11, q1 vmov q12, q1 vtrn.32 d28, d31 vmlal.u8 q8, d26, d0 vmlal.u8 q9, d27, d0 vmlal.u8 q10, d29, d0 vmlal.u8 q11, d30, d0 vmlal.u8 q12, d28, d0 vqmovun.s16 d16, q8 vqmovun.s16 d17, q9 vqmovun.s16 d18, q10 vqmovun.s16 d19, q11 vqmovun.s16 d20, q12 vst1.8 {d16-d17}, [r0,:128]! vst1.32 {d20[0]}, [r0,:32], r1 vst1.8 {d18-d19}, [r0,:128]! 
vst1.32 {d20[1]}, [r0,:32], r1 bgt weight20_nodenom_loop pop {r4-r5,pc} endfunc function mc_weight_w16_nodenom_neon weight_prologue nodenom weight16_nodenom_loop: subs ip, #2 vld1.8 {d16-d17}, [r2], r3 vld1.8 {d18-d19}, [r2], r3 vmov q12, q1 vmov q13, q1 vmov q14, q1 vmov q15, q1 vmlal.u8 q12, d16, d0 vmlal.u8 q13, d17, d0 vmlal.u8 q14, d18, d0 vmlal.u8 q15, d19, d0 vqmovun.s16 d16, q12 vqmovun.s16 d17, q13 vqmovun.s16 d18, q14 vqmovun.s16 d19, q15 vst1.8 {d16-d17}, [r0,:128], r1 vst1.8 {d18-d19}, [r0,:128], r1 bgt weight16_nodenom_loop pop {r4-r5,pc} endfunc function mc_weight_w8_nodenom_neon weight_prologue nodenom weight8_nodenom_loop: subs ip, #2 vld1.8 {d16}, [r2], r3 vld1.8 {d18}, [r2], r3 vmov q10, q1 vmov q11, q1 vmlal.u8 q10, d16, d0 vmlal.u8 q11, d18, d0 vqmovun.s16 d16, q10 vqmovun.s16 d17, q11 vst1.8 {d16}, [r0,:64], r1 vst1.8 {d17}, [r0,:64], r1 bgt weight8_nodenom_loop pop {r4-r5,pc} endfunc function mc_weight_w4_nodenom_neon weight_prologue nodenom weight4_nodenom_loop: subs ip, #2 vld1.32 {d16[0]}, [r2], r3 vld1.32 {d16[1]}, [r2], r3 vmov q10, q1 vmlal.u8 q10, d16, d0 vqmovun.s16 d16, q10 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d16[1]}, [r0], r1 bgt weight4_nodenom_loop pop {r4-r5,pc} endfunc .macro weight_simple_prologue push {lr} ldr lr, [sp, #4] // weight_t ldr ip, [sp, #8] // h ldr lr, [lr] // offset vdup.8 q1, lr .endm .macro weight_simple name op function mc_weight_w20_\name\()_neon weight_simple_prologue weight20_\name\()_loop: subs ip, #2 vld1.8 {d16-d18}, [r2], r3 vld1.8 {d19-d21}, [r2], r3 \op q8, q8, q1 \op q9, q9, q1 \op q10, q10, q1 vst1.8 {d16-d18}, [r0,:64], r1 vst1.8 {d19-d21}, [r0,:64], r1 bgt weight20_\name\()_loop pop {pc} endfunc function mc_weight_w16_\name\()_neon weight_simple_prologue weight16_\name\()_loop: subs ip, #2 vld1.8 {d16-d17}, [r2], r3 vld1.8 {d18-d19}, [r2], r3 \op q8, q8, q1 \op q9, q9, q1 vst1.8 {d16-d17}, [r0,:128], r1 vst1.8 {d18-d19}, [r0,:128], r1 bgt weight16_\name\()_loop pop {pc} endfunc function mc_weight_w8_\name\()_neon weight_simple_prologue weight8_\name\()_loop: subs ip, #2 vld1.8 {d16}, [r2], r3 vld1.8 {d17}, [r2], r3 \op q8, q8, q1 vst1.8 {d16}, [r0,:64], r1 vst1.8 {d17}, [r0,:64], r1 bgt weight8_\name\()_loop pop {pc} endfunc function mc_weight_w4_\name\()_neon weight_simple_prologue weight4_\name\()_loop: subs ip, #2 vld1.32 {d16[]}, [r2], r3 vld1.32 {d17[]}, [r2], r3 \op q8, q8, q1 vst1.32 {d16[0]}, [r0], r1 vst1.32 {d17[0]}, [r0], r1 bgt weight4_\name\()_loop pop {pc} endfunc .endm weight_simple offsetadd, vqadd.u8 weight_simple offsetsub, vqsub.u8 // void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) function mc_copy_w4_neon ldr ip, [sp] copy_w4_loop: subs ip, ip, #4 vld1.32 {d0[]}, [r2], r3 vld1.32 {d1[]}, [r2], r3 vld1.32 {d2[]}, [r2], r3 vld1.32 {d3[]}, [r2], r3 vst1.32 {d0[0]}, [r0,:32], r1 vst1.32 {d1[0]}, [r0,:32], r1 vst1.32 {d2[0]}, [r0,:32], r1 vst1.32 {d3[0]}, [r0,:32], r1 bgt copy_w4_loop bx lr endfunc function mc_copy_w8_neon ldr ip, [sp] copy_w8_loop: subs ip, ip, #4 vld1.32 {d0}, [r2], r3 vld1.32 {d1}, [r2], r3 vld1.32 {d2}, [r2], r3 vld1.32 {d3}, [r2], r3 vst1.32 {d0}, [r0,:64], r1 vst1.32 {d1}, [r0,:64], r1 vst1.32 {d2}, [r0,:64], r1 vst1.32 {d3}, [r0,:64], r1 bgt copy_w8_loop bx lr endfunc function mc_copy_w16_neon ldr ip, [sp] copy_w16_loop: subs ip, ip, #4 vld1.32 {d0-d1}, [r2], r3 vld1.32 {d2-d3}, [r2], r3 vld1.32 {d4-d5}, [r2], r3 vld1.32 {d6-d7}, [r2], r3 vst1.32 {d0-d1}, [r0,:128], r1 vst1.32 {d2-d3}, [r0,:128], r1 vst1.32 {d4-d5}, [r0,:128], r1 vst1.32 
{d6-d7}, [r0,:128], r1 bgt copy_w16_loop bx lr endfunc function mc_copy_w16_aligned_neon ldr ip, [sp] copy_w16_aligned_loop: subs ip, ip, #4 vld1.32 {d0-d1}, [r2,:128], r3 vld1.32 {d2-d3}, [r2,:128], r3 vld1.32 {d4-d5}, [r2,:128], r3 vld1.32 {d6-d7}, [r2,:128], r3 vst1.32 {d0-d1}, [r0,:128], r1 vst1.32 {d2-d3}, [r0,:128], r1 vst1.32 {d4-d5}, [r0,:128], r1 vst1.32 {d6-d7}, [r0,:128], r1 bgt copy_w16_aligned_loop bx lr endfunc // void mc_chroma( uint8_t *dst, intptr_t i_dst_stride, // uint8_t *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); function mc_chroma_neon push {r4-r8, lr} vpush {d8-d11} ldrd r4, r5, [sp, #56] ldrd r6, r7, [sp, #64] asr lr, r6, #3 mul lr, r4, lr add r3, r3, r5, asr #2 cmp r7, #4 and r5, r5, #7 and r6, r6, #7 add r3, r3, lr bic r3, r3, #0x1 pld [r3] pld [r3, r4] bgt mc_chroma_w8 beq mc_chroma_w4 .macro CHROMA_MC_START r00, r01, r10, r11 muls lr, r5, r6 rsb r7, lr, r6, lsl #3 rsb ip, lr, r5, lsl #3 sub r5, lr, r5, lsl #3 sub r5, r5, r6, lsl #3 add r5, r5, #64 beq 2f vld2.8 {\r00-\r01}, [r3], r4 vdup.8 d0, r5 vdup.8 d1, ip vdup.8 d2, r7 vld2.8 {\r10-\r11}, [r3], r4 vdup.8 d3, lr ldr r5, [sp, #72] .endm .macro CHROMA_MC width, align mc_chroma_w\width: CHROMA_MC_START d4, d5, d8, d9 vext.8 d6, d4, d6, #1 vext.8 d7, d5, d7, #1 vext.8 d10, d8, d10, #1 vext.8 d11, d9, d11, #1 // since the element size varies, there's a different index for the 2nd store .if \width == 4 .set st2, 1 .else .set st2, 2 .endif vtrn.32 d4, d6 vtrn.32 d5, d7 vtrn.32 d8, d10 vtrn.32 d9, d11 vtrn.32 d0, d1 vtrn.32 d2, d3 1: // height loop, interpolate xy vmull.u8 q8, d4, d0 vmlal.u8 q8, d8, d2 vmull.u8 q9, d5, d0 vmlal.u8 q9, d9, d2 vld2.8 {d4-d5}, [r3], r4 vext.8 d6, d4, d6, #1 vext.8 d7, d5, d7, #1 vadd.i16 d16, d16, d17 vadd.i16 d17, d18, d19 vtrn.32 d4, d6 vtrn.32 d5, d7 vmull.u8 q10, d8, d0 vmlal.u8 q10, d4, d2 vmull.u8 q11, d9, d0 vmlal.u8 q11, d5, d2 vld2.8 {d8-d9}, [r3], r4 vrshrn.u16 d16, q8, #6 vext.8 d10, d8, d10, #1 vext.8 d11, d9, d11, #1 vadd.i16 d18, d20, d21 vadd.i16 d19, d22, d23 vtrn.32 d8, d10 vtrn.32 d9, d11 vrshrn.u16 d18, q9, #6 subs r5, r5, #2 pld [r3] pld [r3, r4] vst1.\align {d16[0]}, [r0,:\align], r2 vst1.\align {d16[st2]}, [r1,:\align], r2 vst1.\align {d18[0]}, [r0,:\align], r2 vst1.\align {d18[st2]}, [r1,:\align], r2 bgt 1b vpop {d8-d11} pop {r4-r8, pc} 2: // dx or dy are 0 tst r7, r7 add ip, ip, r7 vdup.8 d0, r5 ldr r5, [sp, #72] vdup.8 d1, ip beq 4f vld1.64 {d4}, [r3], r4 vld1.64 {d6}, [r3], r4 3: // vertical interpolation loop vmull.u8 q8, d4, d0 vmlal.u8 q8, d6, d1 vmull.u8 q9, d6, d0 vld1.64 {d4}, [r3], r4 vmlal.u8 q9, d4, d1 vld1.64 {d6}, [r3], r4 vrshrn.u16 d16, q8, #6 // uvuvuvuv vrshrn.u16 d17, q9, #6 // uvuvuvuv subs r5, r5, #2 vuzp.8 d16, d17 // d16=uuuu|uuuu, d17=vvvv|vvvv pld [r3] pld [r3, r4] vst1.\align {d16[0]}, [r0,:\align], r2 vst1.\align {d16[st2]}, [r0,:\align], r2 vst1.\align {d17[0]}, [r1,:\align], r2 vst1.\align {d17[st2]}, [r1,:\align], r2 bgt 3b vpop {d8-d11} pop {r4-r8, pc} 4: // dy is 0 vld1.64 {d4-d5}, [r3], r4 vld1.64 {d6-d7}, [r3], r4 vext.8 d5, d4, d5, #2 vext.8 d7, d6, d7, #2 5: // horizontal interpolation loop vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 vmull.u8 q9, d6, d0 vmlal.u8 q9, d7, d1 subs r5, r5, #2 vld1.64 {d4-d5}, [r3], r4 vld1.64 {d6-d7}, [r3], r4 vext.8 d5, d4, d5, #2 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 vext.8 d7, d6, d7, #2 vuzp.8 d16, d17 pld [r3] pld [r3, r4] vst1.\align {d16[0]}, [r0,:\align], r2 vst1.\align {d16[st2]}, [r0,:\align], r2 vst1.\align {d17[0]}, [r1,:\align], r2 vst1.\align 
{d17[st2]}, [r1,:\align], r2 bgt 5b vpop {d8-d11} pop {r4-r8, pc} .endm CHROMA_MC 2, 16 CHROMA_MC 4, 32 mc_chroma_w8: CHROMA_MC_START d4, d7, d8, d11 vext.8 d5, d4, d5, #1 vext.8 d9, d8, d9, #1 vext.8 d7, d6, d7, #1 vext.8 d11, d10, d11, #1 1: // height loop, interpolate xy vmull.u8 q8, d4, d0 vmlal.u8 q8, d5, d1 vmlal.u8 q8, d8, d2 vmlal.u8 q8, d9, d3 vmull.u8 q9, d6, d0 vmlal.u8 q9, d7, d1 vmlal.u8 q9, d10, d2 vmlal.u8 q9, d11, d3 vld2.8 {d4-d7}, [r3], r4 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 vmull.u8 q10, d8, d0 vmlal.u8 q10, d9, d1 vmlal.u8 q10, d4, d2 vmlal.u8 q10, d5, d3 vmull.u8 q11, d10, d0 vmlal.u8 q11, d11, d1 vmlal.u8 q11, d6, d2 vmlal.u8 q11, d7, d3 subs r5, r5, #2 vld2.8 {d8-d11}, [r3], r4 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 vrshrn.u16 d18, q10, #6 vext.8 d9, d8, d9, #1 vrshrn.u16 d19, q11, #6 vext.8 d11, d10, d11, #1 pld [r3] pld [r3, r4] vst1.64 {d16}, [r0,:64], r2 vst1.64 {d17}, [r1,:64], r2 vst1.64 {d18}, [r0,:64], r2 vst1.64 {d19}, [r1,:64], r2 bgt 1b vpop {d8-d11} pop {r4-r8, pc} 2: // dx or dy are 0 tst r7, r7 add ip, ip, r7 vdup.8 d0, r5 ldr r5, [sp, #72] vdup.8 d1, ip beq 4f vld2.8 {d4-d5}, [r3], r4 vld2.8 {d6-d7}, [r3], r4 3: // vertical interpolation loop vmull.u8 q8, d4, d0 //U vmlal.u8 q8, d6, d1 vmull.u8 q9, d5, d0 //V vmlal.u8 q9, d7, d1 vld2.8 {d4-d5}, [r3], r4 vmull.u8 q10, d6, d0 vmlal.u8 q10, d4, d1 vmull.u8 q11, d7, d0 vmlal.u8 q11, d5, d1 vld2.8 {d6-d7}, [r3], r4 vrshrn.u16 d16, q8, #6 vrshrn.u16 d17, q9, #6 vrshrn.u16 d18, q10, #6 vrshrn.u16 d19, q11, #6 subs r5, r5, #2 pld [r3] pld [r3, r4] vst1.64 {d16}, [r0,:64], r2 vst1.64 {d17}, [r1,:64], r2 vst1.64 {d18}, [r0,:64], r2 vst1.64 {d19}, [r1,:64], r2 bgt 3b vpop {d8-d11} pop {r4-r8, pc} 4: // dy is 0 vld2.8 {d4-d7}, [r3], r4 vld2.8 {d8-d11}, [r3], r4 vext.8 d5, d4, d5, #1 vext.8 d7, d6, d7, #1 vext.8 d9, d8, d9, #1 vext.8 d11, d10, d11, #1 5: // horizontal interpolation loop subs r5, r5, #2 vmull.u8 q8, d4, d0 //U vmlal.u8 q8, d5, d1 vmull.u8 q9, d6, d0 //V vmlal.u8 q9, d7, d1 vld2.8 {d4-d7}, [r3], r4 vmull.u8 q10, d8, d0 vmlal.u8 q10, d9, d1 vmull.u8 q11, d10, d0 vmlal.u8 q11, d11, d1 vld2.8 {d8-d11}, [r3], r4 vext.8 d5, d4, d5, #1 vrshrn.u16 d16, q8, #6 vext.8 d7, d6, d7, #1 vrshrn.u16 d17, q9, #6 vext.8 d9, d8, d9, #1 vrshrn.u16 d18, q10, #6 vext.8 d11, d10, d11, #1 vrshrn.u16 d19, q11, #6 pld [r3] pld [r3, r4] vst1.64 {d16}, [r0,:64], r2 vst1.64 {d17}, [r1,:64], r2 vst1.64 {d18}, [r0,:64], r2 vst1.64 {d19}, [r1,:64], r2 bgt 5b vpop {d8-d11} pop {r4-r8, pc} endfunc // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width ) function hpel_filter_v_neon ldr ip, [sp] sub r1, r1, r3, lsl #1 push {lr} add lr, r1, ip vmov.u8 d30, #5 vmov.u8 d31, #20 filter_v_loop: subs ip, ip, #16 vld1.64 {d0-d1}, [r1,:128], r3 vld1.64 {d2-d3}, [r1,:128], r3 vld1.64 {d4-d5}, [r1,:128], r3 vld1.64 {d6-d7}, [r1,:128], r3 vld1.64 {d16-d17}, [r1,:128], r3 vld1.64 {d18-d19}, [r1,:128], r3 sub r1, lr, ip vaddl.u8 q10, d0, d18 vmlsl.u8 q10, d2, d30 vmlal.u8 q10, d4, d31 vmlal.u8 q10, d6, d31 vmlsl.u8 q10, d16, d30 vaddl.u8 q11, d1, d19 vmlsl.u8 q11, d3, d30 vmlal.u8 q11, d5, d31 vmlal.u8 q11, d7, d31 vmlsl.u8 q11, d17, d30 vqrshrun.s16 d0, q10, #5 vst1.64 {d20-d21}, [r2,:128]! vqrshrun.s16 d1, q11, #5 vst1.64 {d22-d23}, [r2,:128]! vst1.64 {d0-d1}, [r0,:128]! bgt filter_v_loop pop {pc} endfunc // hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); function hpel_filter_c_neon sub r1, #16 vld1.64 {d0-d3}, [r1,:128]! 
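    // Horizontal pass over the 16-bit intermediate rows produced by
    // hpel_filter_v: the (a-b)/4 folding annotated below computes
    // (a - 5*b + 20*c)/16 without overflowing int16, and the final
    // vqrshrun #6 finishes normalizing the two 6-tap (1,-5,20,20,-5,1) passes.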
// unrolled 2x: 4% faster filter_c_loop: subs r2, r2, #16 vld1.64 {d4-d7}, [r1,:128]! vext.16 q8, q0, q1, #6 vext.16 q12, q1, q2, #3 vadd.s16 q8, q8, q12 vext.16 q9, q0, q1, #7 vext.16 q11, q1, q2, #2 vadd.s16 q9, q9, q11 vext.16 q10, q1, q2, #1 vext.16 q11, q1, q2, #6 vadd.s16 q10, q1, q10 vsub.s16 q8, q8, q9 // a-b vext.16 q15, q2, q3, #3 vsub.s16 q9, q9, q10 // b-c vext.16 q12, q1, q2, #7 vshr.s16 q8, q8, #2 // (a-b)/4 vadd.s16 q11, q11, q15 vext.16 q14, q2, q3, #2 vsub.s16 q8, q8, q9 // (a-b)/4-b+c vadd.s16 q12, q12, q14 vext.16 q13, q2, q3, #1 vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4 vadd.s16 q13, q2, q13 vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 vsub.s16 q11, q11, q12 // a-b vsub.s16 q12, q12, q13 // b-c vshr.s16 q11, q11, #2 // (a-b)/4 vqrshrun.s16 d30, q8, #6 vsub.s16 q11, q11, q12 // (a-b)/4-b+c vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4 vld1.64 {d0-d3}, [r1,:128]! vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 vext.16 q8, q2, q3, #6 vqrshrun.s16 d31, q11, #6 vext.16 q12, q3, q0, #3 vadd.s16 q8, q8, q12 vext.16 q9, q2, q3, #7 vst1.64 {d30-d31}, [r0,:128]! bxle lr subs r2, r2, #16 vext.16 q11, q3, q0, #2 vadd.s16 q9, q9, q11 vext.16 q10, q3, q0, #1 vext.16 q11, q3, q0, #6 vadd.s16 q10, q3, q10 vsub.s16 q8, q8, q9 // a-b vext.16 q15, q0, q1, #3 vsub.s16 q9, q9, q10 // b-c vext.16 q12, q3, q0, #7 vshr.s16 q8, q8, #2 // (a-b)/4 vadd.s16 q11, q11, q15 vext.16 q14, q0, q1, #2 vsub.s16 q8, q8, q9 // (a-b)/4-b+c vadd.s16 q12, q12, q14 vext.16 q13, q0, q1, #1 vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4 vadd.s16 q13, q0, q13 vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 vsub.s16 q11, q11, q12 // a-b vsub.s16 q12, q12, q13 // b-c vshr.s16 q11, q11, #2 // (a-b)/4 vqrshrun.s16 d30, q8, #6 vsub.s16 q11, q11, q12 // (a-b)/4-b+c vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4 vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 vqrshrun.s16 d31, q11, #6 vst1.64 {d30-d31}, [r0,:128]! bgt filter_c_loop bx lr endfunc // hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); function hpel_filter_h_neon sub r1, #16 vmov.u8 d30, #5 vld1.64 {d0-d3}, [r1,:128]! vmov.u8 d31, #20 // unrolled 3x because it's 5% faster, due to mitigating // the high latency of multiplication and vqrshrun filter_h_loop: subs r2, r2, #16 vld1.64 {d4-d5}, [r1,:128]! vext.8 q8, q0, q1, #14 vext.8 q12, q1, q2, #3 vaddl.u8 q13, d16, d24 vext.8 q9, q0, q1, #15 vaddl.u8 q14, d17, d25 vext.8 q10, q1, q2, #1 vmlal.u8 q13, d2, d31 vmlsl.u8 q13, d18, d30 vext.8 q11, q1, q2, #2 vmlal.u8 q13, d20, d31 vmlsl.u8 q13, d22, d30 vmlsl.u8 q14, d19, d30 vmlal.u8 q14, d3, d31 vmlal.u8 q14, d21, d31 vmlsl.u8 q14, d23, d30 vqrshrun.s16 d6, q13, #5 vld1.64 {d0-d1}, [r1,:128]! vext.8 q8, q1, q2, #14 vext.8 q12, q2, q0, #3 vaddl.u8 q13, d16, d24 vqrshrun.s16 d7, q14, #5 vext.8 q9, q1, q2, #15 vaddl.u8 q14, d17, d25 vst1.64 {d6-d7}, [r0,:128]! bxle lr subs r2, r2, #16 vext.8 q10, q2, q0, #1 vmlal.u8 q13, d4, d31 vmlsl.u8 q13, d18, d30 vext.8 q11, q2, q0, #2 vmlal.u8 q13, d20, d31 vmlsl.u8 q13, d22, d30 vmlsl.u8 q14, d19, d30 vmlal.u8 q14, d5, d31 vmlal.u8 q14, d21, d31 vmlsl.u8 q14, d23, d30 vqrshrun.s16 d6, q13, #5 vld1.64 {d2-d3}, [r1,:128]! vext.8 q8, q2, q0, #14 vext.8 q12, q0, q1, #3 vaddl.u8 q13, d16, d24 vqrshrun.s16 d7, q14, #5 vext.8 q9, q2, q0, #15 vaddl.u8 q14, d17, d25 vst1.64 {d6-d7}, [r0,:128]! 
bxle lr subs r2, r2, #16 vext.8 q10, q0, q1, #1 vmlal.u8 q13, d0, d31 vmlsl.u8 q13, d18, d30 vext.8 q11, q0, q1, #2 vmlal.u8 q13, d20, d31 vmlsl.u8 q13, d22, d30 vmlsl.u8 q14, d19, d30 vmlal.u8 q14, d1, d31 vmlal.u8 q14, d21, d31 vmlsl.u8 q14, d23, d30 vqrshrun.s16 d6, q13, #5 vqrshrun.s16 d7, q14, #5 vst1.64 {d6-d7}, [r0,:128]! bgt filter_h_loop bx lr endfunc // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, // uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width, // int height ) function frame_init_lowres_core_neon push {r4-r10,lr} vpush {d8-d15} ldrd r4, r5, [sp, #96] ldrd r6, r7, [sp, #104] ldr lr, [sp, #112] sub r10, r6, r7 // dst_stride - width and r10, r10, #~15 lowres_yloop: mov ip, r7 // width mov r6, r0 // src0 add r8, r0, r5 // src1 = src0 + src_stride add r9, r0, r5, lsl #1 // src2 = src1 + src_stride vld2.8 {d8, d10}, [r6,:128]! vld2.8 {d12,d14}, [r8,:128]! vld2.8 {d16,d18}, [r9,:128]! lowres_xloop: subs ip, ip, #16 vld2.8 {d9, d11}, [r6,:128]! vld2.8 {d13,d15}, [r8,:128]! vrhadd.u8 q0, q4, q6 vld2.8 {d17,d19}, [r9,:128]! vrhadd.u8 q5, q5, q7 vld2.8 {d20,d22}, [r6,:128]! vrhadd.u8 q1, q6, q8 vld2.8 {d24,d26}, [r8,:128]! vrhadd.u8 q7, q7, q9 vext.8 q4, q4, q10, #1 vrhadd.u8 q0, q0, q5 vext.8 q6, q6, q12, #1 vrhadd.u8 q1, q1, q7 vld2.8 {d28,d30}, [r9,:128]! vrhadd.u8 q4, q4, q6 vext.8 q8, q8, q14, #1 vrhadd.u8 q6, q6, q8 vst1.64 {d0-d1}, [r1,:128]! vrhadd.u8 q2, q4, q5 vst1.64 {d2-d3}, [r3,:128]! vrhadd.u8 q3, q6, q7 vst1.64 {d4-d5}, [r2,:128]! vst1.64 {d6-d7}, [r4,:128]! ble lowres_xloop_end subs ip, ip, #16 vld2.8 {d21,d23}, [r6,:128]! vld2.8 {d25,d27}, [r8,:128]! vrhadd.u8 q0, q10, q12 vld2.8 {d29,d31}, [r9,:128]! vrhadd.u8 q11, q11, q13 vld2.8 {d8, d10}, [r6,:128]! vrhadd.u8 q1, q12, q14 vld2.8 {d12,d14}, [r8,:128]! vrhadd.u8 q13, q13, q15 vext.8 q10, q10, q4, #1 vrhadd.u8 q0, q0, q11 vext.8 q12, q12, q6, #1 vrhadd.u8 q1, q1, q13 vld2.8 {d16,d18}, [r9,:128]! vrhadd.u8 q10, q10, q12 vext.8 q14, q14, q8, #1 vrhadd.u8 q12, q12, q14 vst1.64 {d0-d1}, [r1,:128]! vrhadd.u8 q2, q10, q11 vst1.64 {d2-d3}, [r3,:128]! vrhadd.u8 q3, q12, q13 vst1.64 {d4-d5}, [r2,:128]! vst1.64 {d6-d7}, [r4,:128]! bgt lowres_xloop lowres_xloop_end: subs lr, lr, #1 add r0, r0, r5, lsl #1 add r1, r1, r10 add r2, r2, r10 add r3, r3, r10 add r4, r4, r10 bgt lowres_yloop vpop {d8-d15} pop {r4-r10,pc} endfunc function load_deinterleave_chroma_fdec_neon mov ip, #FDEC_STRIDE/2 1: vld2.8 {d0-d1}, [r1,:128], r2 subs r3, r3, #1 pld [r1] vst1.8 {d0}, [r0,:64], ip vst1.8 {d1}, [r0,:64], ip bgt 1b bx lr endfunc function load_deinterleave_chroma_fenc_neon mov ip, #FENC_STRIDE/2 1: vld2.8 {d0-d1}, [r1,:128], r2 subs r3, r3, #1 pld [r1] vst1.8 {d0}, [r0,:64], ip vst1.8 {d1}, [r0,:64], ip bgt 1b bx lr endfunc function plane_copy_core_neon push {r4,lr} ldr r4, [sp, #8] ldr lr, [sp, #12] add r12, r4, #15 bic r4, r12, #15 sub r1, r1, r4 sub r3, r3, r4 1: mov r12, r4 16: tst r12, #16 beq 32f subs r12, r12, #16 vld1.8 {q0}, [r2]! vst1.8 {q0}, [r0]! beq 0f 32: subs r12, r12, #32 vld1.8 {q0, q1}, [r2]! vst1.8 {q0, q1}, [r0]! bgt 32b 0: subs lr, lr, #1 add r2, r2, r3 add r0, r0, r1 bgt 1b pop {r4,pc} endfunc function plane_copy_deinterleave_neon push {r4-r7, lr} ldrd r6, r7, [sp, #28] ldrd r4, r5, [sp, #20] add lr, r6, #15 bic lr, lr, #15 sub r1, r1, lr sub r3, r3, lr sub r5, r5, lr, lsl #1 block: vld2.8 {d0-d3}, [r4,:128]! subs lr, lr, #16 vst1.8 {q0}, [r0]! vst1.8 {q1}, [r2]! 
bgt block add r4, r4, r5 subs r7, r7, #1 add r0, r0, r1 add r2, r2, r3 mov lr, r6 bgt block pop {r4-r7, pc} endfunc function plane_copy_deinterleave_rgb_neon push {r4-r8, r10, r11, lr} ldrd r4, r5, [sp, #32] ldrd r6, r7, [sp, #40] ldr r8, [sp, #48] ldrd r10, r11, [sp, #52] add lr, r10, #7 subs r8, r8, #3 bic lr, lr, #7 sub r7, r7, lr, lsl #1 sub r1, r1, lr sub r3, r3, lr sub r5, r5, lr subne r7, r7, lr, lsl #1 subeq r7, r7, lr bne block4 block3: vld3.8 {d0,d1,d2}, [r6]! subs lr, lr, #8 vst1.8 {d0}, [r0]! vst1.8 {d1}, [r2]! vst1.8 {d2}, [r4]! bgt block3 subs r11, r11, #1 add r0, r0, r1 add r2, r2, r3 add r4, r4, r5 add r6, r6, r7 mov lr, r10 bgt block3 pop {r4-r8, r10, r11, pc} block4: vld4.8 {d0,d1,d2,d3}, [r6]! subs lr, lr, #8 vst1.8 {d0}, [r0]! vst1.8 {d1}, [r2]! vst1.8 {d2}, [r4]! bgt block4 subs r11, r11, #1 add r0, r0, r1 add r2, r2, r3 add r4, r4, r5 add r6, r6, r7 mov lr, r10 bgt block4 pop {r4-r8, r10, r11, pc} endfunc function plane_copy_interleave_core_neon push {r4-r7, lr} ldrd r6, r7, [sp, #28] ldrd r4, r5, [sp, #20] add lr, r6, #15 bic lr, lr, #15 sub r1, r1, lr, lsl #1 sub r3, r3, lr sub r5, r5, lr blocki: vld1.8 {q0}, [r2]! vld1.8 {q1}, [r4]! subs lr, lr, #16 vst2.8 {d0,d2}, [r0]! vst2.8 {d1,d3}, [r0]! bgt blocki subs r7, r7, #1 add r0, r0, r1 add r2, r2, r3 add r4, r4, r5 mov lr, r6 bgt blocki pop {r4-r7, pc} endfunc function plane_copy_swap_core_neon push {r4-r5, lr} ldrd r4, r5, [sp, #12] add lr, r4, #15 bic lr, lr, #15 sub r1, r1, lr, lsl #1 sub r3, r3, lr, lsl #1 1: vld1.8 {q0, q1}, [r2]! subs lr, lr, #16 vrev16.8 q0, q0 vrev16.8 q1, q1 vst1.8 {q0, q1}, [r0]! bgt 1b subs r5, r5, #1 add r0, r0, r1 add r2, r2, r3 mov lr, r4 bgt 1b pop {r4-r5, pc} endfunc function store_interleave_chroma_neon push {lr} ldr lr, [sp, #4] mov ip, #FDEC_STRIDE 1: vld1.8 {d0}, [r2], ip vld1.8 {d1}, [r3], ip subs lr, lr, #1 vst2.8 {d0,d1}, [r0,:128], r1 bgt 1b pop {pc} endfunc .macro integral4h p1, p2 vext.8 d1, \p1, \p2, #1 vext.8 d2, \p1, \p2, #2 vext.8 d3, \p1, \p2, #3 vaddl.u8 q0, \p1, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 q0, q0, q2 .endm function integral_init4h_neon sub r3, r0, r2, lsl #1 vld1.8 {d6, d7}, [r1, :128]! 1: subs r2, r2, #16 vld1.16 {q2}, [r3, :128]! integral4h d6, d7 vld1.8 {d6}, [r1, :64]! vld1.16 {q2}, [r3, :128]! vst1.16 {q0}, [r0, :128]! integral4h d7, d6 vld1.8 {d7}, [r1, :64]! vst1.16 {q0}, [r0, :128]! bgt 1b bx lr endfunc .macro integral8h p1, p2, s vext.8 d1, \p1, \p2, #1 vext.8 d2, \p1, \p2, #2 vext.8 d3, \p1, \p2, #3 vext.8 d4, \p1, \p2, #4 vext.8 d5, \p1, \p2, #5 vext.8 d6, \p1, \p2, #6 vext.8 d7, \p1, \p2, #7 vaddl.u8 q0, \p1, d1 vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q2, q2, q3 vadd.u16 q0, q0, q2 vadd.u16 q0, q0, \s .endm function integral_init8h_neon sub r3, r0, r2, lsl #1 vld1.8 {d16, d17}, [r1, :128]! 1: subs r2, r2, #16 vld1.16 {q9}, [r3, :128]! integral8h d16, d17, q9 vld1.8 {d16}, [r1, :64]! vld1.16 {q9}, [r3, :128]! vst1.16 {q0}, [r0, :128]! integral8h d17, d16, q9 vld1.8 {d17}, [r1, :64]! vst1.16 {q0}, [r0, :128]! bgt 1b bx lr endfunc function integral_init4v_neon push {r4-r5} mov r3, r0 add r4, r0, r2, lsl #3 add r5, r0, r2, lsl #4 sub r2, r2, #8 vld1.16 {q11, q12}, [r3]! vld1.16 {q8, q9}, [r5]! vld1.16 {q13}, [r3]! vld1.16 {q10}, [r5]! 1: subs r2, r2, #16 vld1.16 {q14, q15}, [r4]! 
vext.8 q0, q11, q12, #8 vext.8 q1, q12, q13, #8 vext.8 q2, q8, q9, #8 vext.8 q3, q9, q10, #8 vsub.u16 q14, q14, q11 vsub.u16 q15, q15, q12 vadd.u16 q0, q0, q11 vadd.u16 q1, q1, q12 vadd.u16 q2, q2, q8 vadd.u16 q3, q3, q9 vst1.16 {q14}, [r1]! vst1.16 {q15}, [r1]! vmov q11, q13 vmov q8, q10 vsub.u16 q0, q2, q0 vsub.u16 q1, q3, q1 vld1.16 {q12, q13}, [r3]! vld1.16 {q9, q10}, [r5]! vst1.16 {q0}, [r0]! vst1.16 {q1}, [r0]! bgt 1b 2: pop {r4-r5} bx lr endfunc function integral_init8v_neon add r2, r0, r1, lsl #4 sub r1, r1, #8 ands r3, r1, #16 - 1 beq 1f subs r1, r1, #8 vld1.16 {q0}, [r0] vld1.16 {q2}, [r2]! vsub.u16 q8, q2, q0 vst1.16 {q8}, [r0]! ble 2f 1: subs r1, r1, #16 vld1.16 {q0, q1}, [r0] vld1.16 {q2, q3}, [r2]! vsub.u16 q8, q2, q0 vsub.u16 q9, q3, q1 vst1.16 {q8}, [r0]! vst1.16 {q9}, [r0]! bgt 1b 2: bx lr endfunc function mbtree_propagate_cost_neon push {r4-r5,lr} ldrd r4, r5, [sp, #12] ldr lr, [sp, #20] vld1.32 {d6[], d7[]}, [r5] 8: subs lr, lr, #8 vld1.16 {q8}, [r1]! vld1.16 {q9}, [r2]! vld1.16 {q10}, [r3]! vld1.16 {q11}, [r4]! vbic.u16 q10, #0xc000 vmin.u16 q10, q9, q10 vmull.u16 q12, d18, d22 @ propagate_intra vmull.u16 q13, d19, d23 @ propagate_intra vsubl.u16 q14, d18, d20 @ propagate_num vsubl.u16 q15, d19, d21 @ propagate_num vmovl.u16 q10, d18 @ propagate_denom vmovl.u16 q11, d19 @ propagate_denom vmovl.u16 q9, d17 vmovl.u16 q8, d16 vcvt.f32.s32 q12, q12 vcvt.f32.s32 q13, q13 vcvt.f32.s32 q14, q14 vcvt.f32.s32 q15, q15 vcvt.f32.s32 q10, q10 vcvt.f32.s32 q11, q11 vrecpe.f32 q0, q10 vrecpe.f32 q1, q11 vcvt.f32.s32 q8, q8 vcvt.f32.s32 q9, q9 vrecps.f32 q10, q0, q10 vrecps.f32 q11, q1, q11 vmla.f32 q8, q12, q3 @ propagate_amount vmla.f32 q9, q13, q3 @ propagate_amount vmul.f32 q0, q0, q10 vmul.f32 q1, q1, q11 vmul.f32 q8, q8, q14 vmul.f32 q9, q9, q15 vmul.f32 q0, q8, q0 vmul.f32 q1, q9, q1 vcvt.s32.f32 q0, q0 vcvt.s32.f32 q1, q1 vqmovn.s32 d0, q0 vqmovn.s32 d1, q1 vst1.16 {q0}, [r0]! bgt 8b pop {r4-r5,pc} endfunc function mbtree_propagate_list_internal_neon vld1.16 {d4[]}, [sp] @ bipred_weight movrel r12, pw_0to15 vmov.u16 q10, #0xc000 vld1.16 {q0}, [r12, :128] @h->mb.i_mb_x,h->mb.i_mb_y ldrh r12, [sp, #4] vmov.u32 q11, #4 vmov.u8 q3, #32 vdup.u16 q8, r12 @ mb_y vzip.u16 q0, q8 ldr r12, [sp, #8] 8: subs r12, r12, #8 vld1.16 {q14}, [r1, :128]! @ propagate_amount vld1.16 {q15}, [r2]! @ lowres_cost vld1.16 {q8, q9}, [r0]! vand q15, q15, q10 vceq.u16 q1, q15, q10 vmull.u16 q12, d28, d4 vmull.u16 q13, d29, d4 vrshrn.u32 d30, q12, #6 vrshrn.u32 d31, q13, #6 vbsl q1, q15, q14 @ if( lists_used == 3 ) @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 vshr.s16 q12, q8, #5 vshr.s16 q13, q9, #5 vuzp.16 q8, q9 @ x & 31, y & 31 vadd.s16 q12, q12, q0 vadd.s16 q0, q0, q11 vmovn.i16 d16, q8 vmovn.i16 d17, q9 vadd.s16 q13, q13, q0 vbic.i16 q8, #128+64+32 vadd.s16 q0, q0, q11 vbic.i16 q8, #(128+64+32)<<8 vst1.16 {q12, q13}, [r3, :128]! vsub.i8 q9, q3, q8 vmull.u8 q12, d17, d16 @ idx3weight = y*x vmull.u8 q14, d19, d16 @ idx1weight = (32-y)*x vmull.u8 q15, d19, d18 @ idx0weight = (32-y)*(32-x) vmull.u8 q13, d17, d18 @ idx2weight = y*(32-x) vmull.u16 q9, d28, d2 @ idx1weight vmull.u16 q8, d29, d3 vmull.u16 q14, d30, d2 @ idx0weight vmull.u16 q15, d31, d3 vrshrn.u32 d18, q9, #10 @ idx1weight vrshrn.u32 d19, q8, #10 vrshrn.u32 d16, q14, #10 @ idx0weight vrshrn.u32 d17, q15, #10 vmull.u16 q14, d24, d2 @ idx3weight vmull.u16 q15, d25, d3 vzip.16 q8, q9 vmull.u16 q12, d26, d2 @ idx2weight vmull.u16 q13, d27, d3 vst1.16 {q8, q9}, [r3, :128]! 
vrshrn.u32 d19, q15, #10 @ idx3weight vrshrn.u32 d18, q14, #10 vrshrn.u32 d16, q12, #10 @ idx2weight vrshrn.u32 d17, q13, #10 vzip.16 q8, q9 vst1.16 {q8, q9}, [r3, :128]! bge 8b bx lr endfunc @ void mbtree_fix8_pack( int16_t *dst, float *src, int count ) function mbtree_fix8_pack_neon, export=1 subs r3, r2, #8 blt 2f 1: subs r3, r3, #8 vld1.32 {q0,q1}, [r1,:128]! vcvt.s32.f32 q0, q0, #8 vcvt.s32.f32 q1, q1, #8 vqmovn.s32 d4, q0 vqmovn.s32 d5, q1 vrev16.8 q3, q2 vst1.16 {q3}, [r0,:128]! bge 1b 2: adds r3, r3, #8 bxeq lr 3: subs r3, r3, #1 vld1.32 {d0[0]}, [r1]! vcvt.s32.f32 s0, s0, #8 vrev16.8 d0, d0 vst1.16 {d0[0]}, [r0]! bgt 3b bx lr endfunc @ void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) function mbtree_fix8_unpack_neon, export=1 subs r3, r2, #8 blt 2f 1: subs r3, r3, #8 vld1.16 {q0}, [r1,:128]! vrev16.8 q1, q0 vmovl.s16 q0, d2 vmovl.s16 q1, d3 vcvt.f32.s32 q0, q0, #8 vcvt.f32.s32 q1, q1, #8 vst1.32 {q0,q1}, [r0,:128]! bge 1b 2: adds r3, r3, #8 bxeq lr 3: subs r3, r3, #1 vld1.16 {d0[0]}, [r1]! vrev16.8 d0, d0 vmovl.s16 q0, d0 vcvt.f32.s32 d0, d0, #8 vst1.32 {d0[0]}, [r0]! bgt 3b bx lr endfunc x264-master/common/arm/mc-c.c000066400000000000000000000420621502133446700161150ustar00rootroot00000000000000/***************************************************************************** * mc-c.c: arm motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "mc.h" #define x264_prefetch_ref_arm x264_template(prefetch_ref_arm) void x264_prefetch_ref_arm( uint8_t *, intptr_t, int ); #define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm) void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon) void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); #define x264_memzero_aligned_neon x264_template(memzero_aligned_neon) void x264_memzero_aligned_neon( void *dst, size_t n ); #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon) void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon) void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon) void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon) void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon) void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon) void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon) void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon) void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon) void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon) void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon) void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon) void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon) void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_plane_copy_core_neon x264_template(plane_copy_core_neon) void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon) void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon) void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); #define x264_plane_copy_interleave_core_neon 
x264_template(plane_copy_interleave_core_neon) void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon) void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); #define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon) void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); #define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon) void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon) void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon) #define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon) #define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon) #define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon) #define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon) #define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon) #define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon) #define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon) #define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon) #define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon) #define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon) #define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon) #define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon) #define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon) #define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon) #define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon) #if !HIGH_BIT_DEPTH #define MC_WEIGHT(func)\ void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ \ static weight_fn_t mc##func##_wtab_neon[6] =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w8##func##_neon,\ x264_mc_weight_w16##func##_neon,\ x264_mc_weight_w16##func##_neon,\ x264_mc_weight_w20##func##_neon,\ }; MC_WEIGHT() MC_WEIGHT(_nodenom) MC_WEIGHT(_offsetadd) MC_WEIGHT(_offsetsub) #endif #define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon) void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon) void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon) void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon) 
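/* Editorial note, not in upstream x264: each x264_template( name ) wrapper in
 * this block mangles the plain symbol with the configured bit depth (for
 * example, mc_copy_w16_neon becomes something like x264_8_mc_copy_w16_neon in
 * an 8-bit build), so 8-bit and 10-bit compilations of the same translation
 * unit can be linked into one binary without symbol clashes. The #define keeps
 * the short x264_-prefixed name usable in the code below. */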
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_chroma_neon x264_template(mc_chroma_neon) void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); #define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon) void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); #define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon) void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int ); #define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon) void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int ); #define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon) void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int ); #define x264_integral_init4h_neon x264_template(integral_init4h_neon) void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t ); #define x264_integral_init4v_neon x264_template(integral_init4v_neon) void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t ); #define x264_integral_init8h_neon x264_template(integral_init8h_neon) void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t ); #define x264_integral_init8v_neon x264_template(integral_init8v_neon) void x264_integral_init8v_neon( uint16_t *, intptr_t ); #define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon) void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); #define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon) void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count ); #define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon) void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count ); #if !HIGH_BIT_DEPTH static void weight_cache_neon( x264_t *h, x264_weight_t *w ) { if( w->i_scale == 1<i_denom ) { if( w->i_offset < 0 ) { w->weightfn = mc_offsetsub_wtab_neon; w->cachea[0] = -w->i_offset; } else { w->weightfn = mc_offsetadd_wtab_neon; w->cachea[0] = w->i_offset; } } else if( !w->i_denom ) w->weightfn = mc_nodenom_wtab_neon; else w->weightfn = mc_wtab_neon; } static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = { NULL, x264_pixel_avg2_w4_neon, x264_pixel_avg2_w8_neon, x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function x264_pixel_avg2_w16_neon, x264_pixel_avg2_w20_neon, }; static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = { NULL, x264_mc_copy_w4_neon, x264_mc_copy_w8_neon, NULL, x264_mc_copy_w16_neon, }; static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if( (mvy&3) == 3 ) // explicit if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height ); } else if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, 
i_dst_stride, src1, i_src_stride, weight, i_height ); else mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); } static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if( (mvy&3) == 3 ) // explicit if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height ); return dst; } else if( weight->weightfn ) { weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height ); return dst; } else { *i_dst_stride = i_src_stride; return src1; } } static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ) { intptr_t realign = (intptr_t)src & 15; src -= realign; dstv -= realign; dstc -= realign; dsth -= realign; width += realign; while( height-- ) { x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width ); x264_hpel_filter_c_neon( dstc, buf+8, width ); x264_hpel_filter_h_neon( dsth, src, width ); dsth += stride; dstv += stride; dstc += stride; src += stride; } } PLANE_COPY(16, neon) PLANE_COPY_SWAP(16, neon) PLANE_INTERLEAVE(neon) PROPAGATE_LIST(neon) #endif // !HIGH_BIT_DEPTH void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_ARMV6) ) return; #if !HIGH_BIT_DEPTH pf->prefetch_fenc_420 = x264_prefetch_fenc_arm; pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */ pf->prefetch_ref = x264_prefetch_ref_arm; #endif // !HIGH_BIT_DEPTH if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; pf->plane_copy = plane_copy_neon; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = plane_copy_interleave_neon; pf->plane_copy_swap = plane_copy_swap_neon; pf->store_interleave_chroma = x264_store_interleave_chroma_neon; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; pf->weight = mc_wtab_neon; pf->offsetadd = mc_offsetadd_wtab_neon; pf->offsetsub = mc_offsetsub_wtab_neon; pf->weight_cache = weight_cache_neon; pf->mc_chroma = x264_mc_chroma_neon; pf->mc_luma = mc_luma_neon; pf->get_ref = get_ref_neon; pf->hpel_filter = hpel_filter_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; pf->integral_init4h = 
x264_integral_init4h_neon; pf->integral_init8h = x264_integral_init8h_neon; pf->integral_init4v = x264_integral_init4v_neon; pf->integral_init8v = x264_integral_init8v_neon; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; pf->mbtree_propagate_list = mbtree_propagate_list_neon; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon; #endif // !HIGH_BIT_DEPTH // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs #ifndef SYS_MACOSX pf->memcpy_aligned = x264_memcpy_aligned_neon; #endif pf->memzero_aligned = x264_memzero_aligned_neon; } x264-master/common/arm/mc.h000066400000000000000000000025471502133446700157060ustar00rootroot00000000000000/***************************************************************************** * mc.h: arm motion compensation ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ARM_MC_H #define X264_ARM_MC_H #define x264_mc_init_arm x264_template(mc_init_arm) void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf ); #endif x264-master/common/arm/pixel-a.S000066400000000000000000001167301502133446700166210ustar00rootroot00000000000000/***************************************************************************** * pixel.S: arm pixel metrics ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" const mask_array, align=4 .rept 16 .byte 0xff .endr mask_ff: .rept 16 .byte 0 .endr endconst const mask_ac4, align=4 .short 0, -1, -1, -1, 0, -1, -1, -1 endconst const mask_ac8, align=4 .short 0, -1, -1, -1, -1, -1, -1, -1 endconst .text .macro SAD4_ARMV6 h function pixel_sad_4x\h\()_armv6 push {r4-r6,lr} ldr r4, [r2], r3 ldr r5, [r0], r1 ldr r6, [r2], r3 ldr lr, [r0], r1 usad8 ip, r4, r5 .rept (\h - 2)/2 ldr r4, [r2], r3 ldr r5, [r0], r1 usada8 ip, r6, lr, ip ldr r6, [r2], r3 ldr lr, [r0], r1 usada8 ip, r4, r5, ip .endr usada8 r0, r6, lr, ip pop {r4-r6,pc} endfunc .endm SAD4_ARMV6 4 SAD4_ARMV6 8 .macro SAD_START_4 align:vararg vld1.32 {d1[]}, [r2\align], r3 vld1.32 {d0[]}, [r0,:32], r1 vabdl.u8 q8, d0, d1 .endm .macro SAD_4 align:vararg vld1.32 {d1[]}, [r2\align], r3 vld1.32 {d0[]}, [r0,:32], r1 vabal.u8 q8, d0, d1 .endm .macro SAD_START_8 align:vararg vld1.64 {d1}, [r2\align], r3 vld1.64 {d0}, [r0,:64], r1 vabdl.u8 q8, d0, d1 .endm .macro SAD_8 align:vararg vld1.64 {d1}, [r2\align], r3 vld1.64 {d0}, [r0,:64], r1 vabal.u8 q8, d0, d1 .endm .macro SAD_START_16 align:vararg vld1.64 {d2-d3}, [r2\align], r3 vld1.64 {d0-d1}, [r0,:128], r1 vabdl.u8 q8, d0, d2 vld1.64 {d6-d7}, [r2\align], r3 vabdl.u8 q9, d1, d3 vld1.64 {d4-d5}, [r0,:128], r1 .endm .macro SAD_16 align:vararg vabal.u8 q8, d4, d6 vld1.64 {d2-d3}, [r2\align], r3 vabal.u8 q9, d5, d7 vld1.64 {d0-d1}, [r0,:128], r1 vabal.u8 q8, d0, d2 vld1.64 {d6-d7}, [r2\align], r3 vabal.u8 q9, d1, d3 vld1.64 {d4-d5}, [r0,:128], r1 .endm .macro SAD_FUNC w, h, name, align:vararg function pixel_sad\name\()_\w\()x\h\()_neon SAD_START_\w \align .if \w == 16 .rept \h / 2 - 1 SAD_\w \align .endr .else .rept \h - 1 SAD_\w \align .endr .endif .if \w > 8 vabal.u8 q8, d4, d6 vabal.u8 q9, d5, d7 vadd.u16 q8, q8, q9 .endif .if \w > 4 vadd.u16 d16, d16, d17 .endif vpadd.u16 d0, d16, d16 vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr endfunc .endm SAD_FUNC 4, 4 SAD_FUNC 4, 8 SAD_FUNC 8, 4 SAD_FUNC 8, 8 SAD_FUNC 8, 16 SAD_FUNC 16, 8 SAD_FUNC 16, 16 SAD_FUNC 4, 4, _aligned, ,:32 SAD_FUNC 4, 8, _aligned, ,:32 SAD_FUNC 8, 4, _aligned, ,:64 SAD_FUNC 8, 8, _aligned, ,:64 SAD_FUNC 8, 16, _aligned, ,:64 SAD_FUNC 16, 8, _aligned, ,:128 SAD_FUNC 16, 16, _aligned, ,:128 // If dual issue is possible, use additional accumulators to avoid // stalls from vadal's latency. This only matters for aligned. 
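// Editorial sketch, not part of upstream x264: the point of the *_dual
// variants below is to break the accumulator dependency chain. With a single
// accumulator, consecutive accumulating absolute-difference instructions
// serialize:
//     vabal.u8 q8, d0, d1
//     vabal.u8 q8, d2, d3   @ stalls waiting on the previous q8 result
// Splitting the sum across two independent accumulators lets the loads and
// vabal instructions dual-issue on cores that support it:
//     vabal.u8 q8, d0, d1
//     vabal.u8 q9, d2, d3   @ no dependency on q8
// and the partial sums are merged once at the end (vadd.u16 q8, q8, q9).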
.macro SAD_DUAL_START_8 SAD_START_8 ,:64 vld1.64 {d3}, [r2,:64], r3 vld1.64 {d2}, [r0,:64], r1 vabdl.u8 q9, d2, d3 .endm .macro SAD_DUAL_8 align:vararg vld1.64 {d1}, [r2,:64], r3 vld1.64 {d0}, [r0,:64], r1 vabal.u8 q8, d0, d1 vld1.64 {d3}, [r2,:64], r3 vld1.64 {d2}, [r0,:64], r1 vabal.u8 q9, d2, d3 .endm .macro SAD_DUAL_START_16 SAD_START_16 ,:128 vabdl.u8 q10, d4, d6 vld1.64 {d2-d3}, [r2,:128], r3 vabdl.u8 q11, d5, d7 vld1.64 {d0-d1}, [r0,:128], r1 .endm .macro SAD_DUAL_16 vabal.u8 q8, d0, d2 vld1.64 {d6-d7}, [r2,:128], r3 vabal.u8 q9, d1, d3 vld1.64 {d4-d5}, [r0,:128], r1 vabal.u8 q10, d4, d6 vld1.64 {d2-d3}, [r2,:128], r3 vabal.u8 q11, d5, d7 vld1.64 {d0-d1}, [r0,:128], r1 .endm .macro SAD_DUAL_END_16 vabal.u8 q8, d0, d2 vld1.64 {d6-d7}, [r2,:128], r3 vabal.u8 q9, d1, d3 vld1.64 {d4-d5}, [r0,:128], r1 vabal.u8 q10, d4, d6 vabal.u8 q11, d5, d7 .endm .macro SAD_FUNC_DUAL w, h function pixel_sad_aligned_\w\()x\h\()_neon_dual SAD_DUAL_START_\w .rept \h / 2 - \w / 8 SAD_DUAL_\w .endr .if \w > 8 SAD_DUAL_END_16 vadd.u16 q8, q8, q9 vadd.u16 q9, q10, q11 .endif .if \w > 4 vadd.u16 q8, q8, q9 vadd.u16 d16, d16, d17 .endif vpadd.u16 d0, d16, d16 vpaddl.u16 d0, d0 vmov.u32 r0, d0[0] bx lr endfunc .endm SAD_FUNC_DUAL 8, 4 SAD_FUNC_DUAL 8, 8 SAD_FUNC_DUAL 8, 16 SAD_FUNC_DUAL 16, 8 SAD_FUNC_DUAL 16, 16 .macro SAD_X_START_4 x vld1.32 {d0[]}, [r0,:32], lr vld1.32 {d1[]}, [r1], r6 vabdl.u8 q8, d1, d0 vld1.32 {d2[]}, [r2], r6 vabdl.u8 q9, d2, d0 vld1.32 {d3[]}, [r3], r6 vabdl.u8 q10, d3, d0 .if \x == 4 vld1.32 {d4[]}, [r12], r6 vabdl.u8 q11, d4, d0 .endif .endm .macro SAD_X_4 x vld1.32 {d0[]}, [r0,:32], lr vld1.32 {d1[]}, [r1], r6 vabal.u8 q8, d1, d0 vld1.32 {d2[]}, [r2], r6 vabal.u8 q9, d2, d0 vld1.32 {d3[]}, [r3], r6 vabal.u8 q10, d3, d0 .if \x == 4 vld1.32 {d4[]}, [r12], r6 vabal.u8 q11, d4, d0 .endif .endm .macro SAD_X_START_8 x vld1.64 {d0}, [r0,:64], lr vld1.64 {d1}, [r1], r6 vabdl.u8 q8, d1, d0 vld1.64 {d2}, [r2], r6 vabdl.u8 q9, d2, d0 vld1.64 {d3}, [r3], r6 vabdl.u8 q10, d3, d0 .if \x == 4 vld1.64 {d4}, [r12], r6 vabdl.u8 q11, d4, d0 .endif .endm .macro SAD_X_8 x vld1.64 {d0}, [r0,:64], lr vld1.64 {d1}, [r1], r6 vabal.u8 q8, d1, d0 vld1.64 {d2}, [r2], r6 vabal.u8 q9, d2, d0 vld1.64 {d3}, [r3], r6 vabal.u8 q10, d3, d0 .if \x == 4 vld1.64 {d4}, [r12], r6 vabal.u8 q11, d4, d0 .endif .endm .macro SAD_X_START_16 x vld1.64 {d0-d1}, [r0,:128], lr vld1.64 {d2-d3}, [r1], r6 vabdl.u8 q8, d2, d0 vabdl.u8 q12, d3, d1 vld1.64 {d4-d5}, [r2], r6 vabdl.u8 q9, d4, d0 vabdl.u8 q13, d5, d1 vld1.64 {d6-d7}, [r3], r6 vabdl.u8 q10, d6, d0 vabdl.u8 q14, d7, d1 .if \x == 4 vld1.64 {d2-d3}, [r12], r6 vabdl.u8 q11, d2, d0 vabdl.u8 q15, d3, d1 .endif .endm .macro SAD_X_16 x vld1.64 {d0-d1}, [r0,:128], lr vld1.64 {d2-d3}, [r1], r6 vabal.u8 q8, d2, d0 vabal.u8 q12, d3, d1 vld1.64 {d4-d5}, [r2], r6 vabal.u8 q9, d4, d0 vabal.u8 q13, d5, d1 vld1.64 {d6-d7}, [r3], r6 vabal.u8 q10, d6, d0 vabal.u8 q14, d7, d1 .if \x == 4 vld1.64 {d2-d3}, [r12], r6 vabal.u8 q11, d2, d0 vabal.u8 q15, d3, d1 .endif .endm .macro SAD_X_FUNC x, w, h function pixel_sad_x\x\()_\w\()x\h\()_neon push {r6-r7,lr} .if \x == 3 ldrd r6, r7, [sp, #12] .else ldrd r6, r7, [sp, #16] ldr r12, [sp, #12] .endif mov lr, #FENC_STRIDE SAD_X_START_\w \x .rept \h - 1 SAD_X_\w \x .endr // add up the sads .if \w > 8 vadd.u16 q8, q8, q12 vadd.u16 q9, q9, q13 vadd.u16 q10, q10, q14 .if \x == 4 vadd.u16 q11, q11, q15 .endif .endif .if \w > 4 vadd.u16 d16, d16, d17 vadd.u16 d18, d18, d19 vadd.u16 d20, d20, d21 .if \x == 4 vadd.u16 d22, d22, d23 .endif .endif vpadd.u16 d0, d16, 
d18 vpadd.u16 d1, d20, d22 vpaddl.u16 q0, q0 .if \x == 3 vst1.32 {d0}, [r7]! vst1.32 {d1[0]}, [r7,:32] .else vst1.32 {d0-d1}, [r7] .endif pop {r6-r7,pc} endfunc .endm SAD_X_FUNC 3, 4, 4 SAD_X_FUNC 3, 4, 8 SAD_X_FUNC 3, 8, 4 SAD_X_FUNC 3, 8, 8 SAD_X_FUNC 3, 8, 16 SAD_X_FUNC 3, 16, 8 SAD_X_FUNC 3, 16, 16 SAD_X_FUNC 4, 4, 4 SAD_X_FUNC 4, 4, 8 SAD_X_FUNC 4, 8, 4 SAD_X_FUNC 4, 8, 8 SAD_X_FUNC 4, 8, 16 SAD_X_FUNC 4, 16, 8 SAD_X_FUNC 4, 16, 16 function pixel_vsad_neon subs r2, r2, #2 vld1.8 {q0}, [r0], r1 vld1.8 {q1}, [r0], r1 vabdl.u8 q2, d0, d2 vabdl.u8 q3, d1, d3 ble 2f 1: subs r2, r2, #2 vld1.8 {q0}, [r0], r1 vabal.u8 q2, d2, d0 vabal.u8 q3, d3, d1 vld1.8 {q1}, [r0], r1 blt 2f vabal.u8 q2, d0, d2 vabal.u8 q3, d1, d3 bgt 1b 2: vadd.u16 q0, q2, q3 HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr endfunc function pixel_asd8_neon ldr r12, [sp, #0] sub r12, r12, #2 vld1.8 {d0}, [r0], r1 vld1.8 {d1}, [r2], r3 vld1.8 {d2}, [r0], r1 vld1.8 {d3}, [r2], r3 vsubl.u8 q8, d0, d1 1: subs r12, r12, #2 vld1.8 {d4}, [r0], r1 vld1.8 {d5}, [r2], r3 vsubl.u8 q9, d2, d3 vsubl.u8 q10, d4, d5 vadd.s16 q8, q9 vld1.8 {d2}, [r0], r1 vld1.8 {d3}, [r2], r3 vadd.s16 q8, q10 bgt 1b vsubl.u8 q9, d2, d3 vadd.s16 q8, q9 vpaddl.s16 q8, q8 vpadd.s32 d16, d16, d17 vpadd.s32 d16, d16, d17 vabs.s32 d16, d16 vmov.32 r0, d16[0] bx lr endfunc .macro SSD_START_4 vld1.32 {d16[]}, [r0,:32], r1 vld1.32 {d17[]}, [r2,:32], r3 vsubl.u8 q2, d16, d17 vld1.32 {d16[]}, [r0,:32], r1 vmull.s16 q0, d4, d4 vld1.32 {d17[]}, [r2,:32], r3 .endm .macro SSD_4 vsubl.u8 q2, d16, d17 vld1.32 {d16[]}, [r0,:32], r1 vmlal.s16 q0, d4, d4 vld1.32 {d17[]}, [r2,:32], r3 .endm .macro SSD_END_4 vsubl.u8 q2, d16, d17 vmlal.s16 q0, d4, d4 .endm .macro SSD_START_8 vld1.64 {d16}, [r0,:64], r1 vld1.64 {d17}, [r2,:64], r3 vsubl.u8 q2, d16, d17 vld1.64 {d16}, [r0,:64], r1 vmull.s16 q0, d4, d4 vmlal.s16 q0, d5, d5 vld1.64 {d17}, [r2,:64], r3 .endm .macro SSD_8 vsubl.u8 q2, d16, d17 vld1.64 {d16}, [r0,:64], r1 vmlal.s16 q0, d4, d4 vmlal.s16 q0, d5, d5 vld1.64 {d17}, [r2,:64], r3 .endm .macro SSD_END_8 vsubl.u8 q2, d16, d17 vmlal.s16 q0, d4, d4 vmlal.s16 q0, d5, d5 .endm .macro SSD_START_16 vld1.64 {d16-d17}, [r0,:128], r1 vld1.64 {d18-d19}, [r2,:128], r3 vsubl.u8 q2, d16, d18 vsubl.u8 q3, d17, d19 vld1.64 {d16-d17}, [r0,:128], r1 vmull.s16 q0, d4, d4 vmlal.s16 q0, d5, d5 vld1.64 {d18-d19}, [r2,:128], r3 vmlal.s16 q0, d6, d6 vmlal.s16 q0, d7, d7 .endm .macro SSD_16 vsubl.u8 q2, d16, d18 vsubl.u8 q3, d17, d19 vld1.64 {d16-d17}, [r0,:128], r1 vmlal.s16 q0, d4, d4 vmlal.s16 q0, d5, d5 vld1.64 {d18-d19}, [r2,:128], r3 vmlal.s16 q0, d6, d6 vmlal.s16 q0, d7, d7 .endm .macro SSD_END_16 vsubl.u8 q2, d16, d18 vsubl.u8 q3, d17, d19 vmlal.s16 q0, d4, d4 vmlal.s16 q0, d5, d5 vmlal.s16 q0, d6, d6 vmlal.s16 q0, d7, d7 .endm .macro SSD_FUNC w h function pixel_ssd_\w\()x\h\()_neon SSD_START_\w .rept \h-2 SSD_\w .endr SSD_END_\w vadd.s32 d0, d0, d1 vpadd.s32 d0, d0, d0 vmov.32 r0, d0[0] bx lr endfunc .endm SSD_FUNC 4, 4 SSD_FUNC 4, 8 SSD_FUNC 8, 4 SSD_FUNC 8, 8 SSD_FUNC 8, 16 SSD_FUNC 16, 8 SSD_FUNC 16, 16 function pixel_ssd_nv12_core_neon push {r4-r5} ldrd r4, r5, [sp, #8] add r12, r4, #8 bic r12, r12, #15 vmov.u64 q8, #0 vmov.u64 q9, #0 sub r1, r1, r12, lsl #1 sub r3, r3, r12, lsl #1 1: subs r12, r4, #16 vld2.8 {d0,d1}, [r0]! vld2.8 {d2,d3}, [r2]! vld2.8 {d4,d5}, [r0]! vld2.8 {d6,d7}, [r2]! 
vsubl.u8 q10, d0, d2 vsubl.u8 q11, d1, d3 vmull.s16 q14, d20, d20 vmull.s16 q15, d22, d22 vsubl.u8 q12, d4, d6 vsubl.u8 q13, d5, d7 vmlal.s16 q14, d21, d21 vmlal.s16 q15, d23, d23 blt 4f beq 3f 2: vmlal.s16 q14, d24, d24 vmlal.s16 q15, d26, d26 vld2.8 {d0,d1}, [r0]! vld2.8 {d2,d3}, [r2]! vmlal.s16 q14, d25, d25 vmlal.s16 q15, d27, d27 subs r12, r12, #16 vsubl.u8 q10, d0, d2 vsubl.u8 q11, d1, d3 vmlal.s16 q14, d20, d20 vmlal.s16 q15, d22, d22 vld2.8 {d4,d5}, [r0]! vld2.8 {d6,d7}, [r2]! vmlal.s16 q14, d21, d21 vmlal.s16 q15, d23, d23 blt 4f vsubl.u8 q12, d4, d6 vsubl.u8 q13, d5, d7 bgt 2b 3: vmlal.s16 q14, d24, d24 vmlal.s16 q15, d26, d26 vmlal.s16 q14, d25, d25 vmlal.s16 q15, d27, d27 4: subs r5, r5, #1 vaddw.s32 q8, q8, d28 vaddw.s32 q9, q9, d30 add r0, r0, r1 add r2, r2, r3 vaddw.s32 q8, q8, d29 vaddw.s32 q9, q9, d31 bgt 1b vadd.u64 d16, d16, d17 vadd.u64 d18, d18, d19 ldrd r4, r5, [sp, #16] vst1.64 {d16}, [r4] vst1.64 {d18}, [r5] pop {r4-r5} bx lr endfunc .macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16 vmull.u8 \qsqr, \dsrc, \dsrc vaddw.u8 q0, q0, \dsrc \vpadal \qsqr_sum, \qsqr_last .endm function pixel_var_8x8_neon vld1.64 {d16}, [r0,:64], r1 vmull.u8 q1, d16, d16 vmovl.u8 q0, d16 vld1.64 {d18}, [r0,:64], r1 vmull.u8 q2, d18, d18 vaddw.u8 q0, q0, d18 vld1.64 {d20}, [r0,:64], r1 VAR_SQR_SUM q1, q1, q3, d20, vpaddl.u16 vld1.64 {d22}, [r0,:64], r1 VAR_SQR_SUM q2, q2, q8, d22, vpaddl.u16 vld1.64 {d24}, [r0,:64], r1 VAR_SQR_SUM q1, q3, q9, d24 vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q8, q10, d26 vld1.64 {d24}, [r0,:64], r1 VAR_SQR_SUM q1, q9, q14, d24 vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 b var_end endfunc function pixel_var_8x16_neon vld1.64 {d16}, [r0,:64], r1 vld1.64 {d18}, [r0,:64], r1 vmull.u8 q1, d16, d16 vmovl.u8 q0, d16 vld1.64 {d20}, [r0,:64], r1 vmull.u8 q2, d18, d18 vaddw.u8 q0, q0, d18 mov ip, #12 vld1.64 {d22}, [r0,:64], r1 VAR_SQR_SUM q1, q1, q14, d20, vpaddl.u16 vld1.64 {d16}, [r0,:64], r1 VAR_SQR_SUM q2, q2, q15, d22, vpaddl.u16 1: subs ip, ip, #4 vld1.64 {d18}, [r0,:64], r1 VAR_SQR_SUM q1, q14, q12, d16 vld1.64 {d20}, [r0,:64], r1 VAR_SQR_SUM q2, q15, q13, d18 vld1.64 {d22}, [r0,:64], r1 VAR_SQR_SUM q1, q12, q14, d20 beq 2f vld1.64 {d16}, [r0,:64], r1 VAR_SQR_SUM q2, q13, q15, d22 b 1b 2: VAR_SQR_SUM q2, q13, q15, d22 b var_end endfunc function pixel_var_16x16_neon vld1.64 {d16-d17}, [r0,:128], r1 vmull.u8 q12, d16, d16 vmovl.u8 q0, d16 vmull.u8 q13, d17, d17 vaddw.u8 q0, q0, d17 vld1.64 {d18-d19}, [r0,:128], r1 VAR_SQR_SUM q1, q12, q14, d18, vpaddl.u16 VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16 mov ip, #7 var16_loop: subs ip, ip, #1 vld1.64 {d16-d17}, [r0,:128], r1 VAR_SQR_SUM q1, q14, q12, d16 VAR_SQR_SUM q2, q15, q13, d17 vld1.64 {d18-d19}, [r0,:128], r1 VAR_SQR_SUM q1, q12, q14, d18 VAR_SQR_SUM q2, q13, q15, d19 bgt var16_loop endfunc function var_end, export=0 vpaddl.u16 q8, q14 vpaddl.u16 q9, q15 vadd.u32 q1, q1, q8 vadd.u16 d0, d0, d1 vadd.u32 q1, q1, q9 vadd.u32 q1, q1, q2 vpaddl.u16 d0, d0 vadd.u32 d2, d2, d3 vpadd.u32 d0, d0, d2 vmov r0, r1, d0 bx lr endfunc .macro DIFF_SUM diff1 diff2 da1 db1 da2 db2 lastdiff1 lastdiff2 acc1 acc2 vld1.64 {\da1}, [r0,:64]! vld1.64 {\db1}, [r1,:64], r3 .ifnb \lastdiff1 vadd.s16 \acc1, \acc1, \lastdiff1 vadd.s16 \acc2, \acc2, \lastdiff2 .endif vld1.64 {\da2}, [r0,:64]! 
vld1.64 {\db2}, [r1,:64], r3 vsubl.u8 \diff1, \da1, \db1 vsubl.u8 \diff2, \da2, \db2 .endm .macro SQR_ACC_DOUBLE acc1 acc2 d0 d1 d2 d3 vmlal=vmlal.s16 \vmlal \acc1, \d0, \d0 vmlal.s16 \acc1, \d1, \d1 \vmlal \acc2, \d2, \d2 vmlal.s16 \acc2, \d3, \d3 .endm .macro SQR_ACC acc d0 d1 vmlal=vmlal.s16 \vmlal \acc, \d0, \d0 vmlal.s16 \acc, \d1, \d1 .endm function pixel_var2_8x8_neon mov r3, #16 DIFF_SUM q0, q10, d0, d1, d20, d21 DIFF_SUM q8, q11, d16, d17, d22, d23 SQR_ACC_DOUBLE q1, q13, d0, d1, d20, d21, vmull.s16 DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10 SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23, vmull.s16 .rept 2 DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10 SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25 DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10 SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23 .endr DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10 SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25 vadd.s16 q0, q0, q8 vadd.s16 q10, q10, q11 SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23 vadd.s16 d0, d0, d1 vadd.s16 d20, d20, d21 vadd.s32 q1, q1, q2 vadd.s32 q13, q13, q14 vpaddl.s16 d0, d0 vpaddl.s16 d20, d20 vadd.s32 d1, d2, d3 vadd.s32 d26, d26, d27 vpadd.s32 d0, d0, d20 @ sum vpadd.s32 d1, d1, d26 @ sqr vmul.s32 d0, d0, d0 @ sum*sum vshr.s32 d0, d0, #6 vsub.s32 d0, d1, d0 vpadd.s32 d0, d0, d0 vmov r0, r1, d0 vst1.32 {d1}, [r2,:64] bx lr endfunc function pixel_var2_8x16_neon mov r3, #16 vld1.64 {d16}, [r0,:64]! vld1.64 {d17}, [r1,:64], r3 vld1.64 {d18}, [r0,:64]! vld1.64 {d19}, [r1,:64], r3 vsubl.u8 q0, d16, d17 vsubl.u8 q3, d18, d19 SQR_ACC q1, d0, d1, vmull.s16 vld1.64 {d16}, [r0,:64]! mov ip, #15 vld1.64 {d17}, [r1,:64], r3 SQR_ACC q2, d6, d7, vmull.s16 1: subs ip, ip, #1 vld1.64 {d18}, [r0,:64]! vsubl.u8 q10, d16, d17 vld1.64 {d19}, [r1,:64], r3 vadd.s16 q0, q0, q10 SQR_ACC q1, d20, d21 vsubl.u8 q11, d18, d19 beq 2f vld1.64 {d16}, [r0,:64]! 
vadd.s16 q3, q3, q11 vld1.64 {d17}, [r1,:64], r3 SQR_ACC q2, d22, d23 b 1b 2: vadd.s16 q3, q3, q11 SQR_ACC q2, d22, d23 vadd.s16 d0, d0, d1 vadd.s16 d6, d6, d7 vpaddl.s16 d0, d0 vpaddl.s16 d6, d6 vadd.s32 d2, d2, d3 vadd.s32 d4, d4, d5 vpadd.s32 d0, d0, d6 @ sum vpadd.s32 d2, d2, d4 @ sqr vmul.s32 d0, d0, d0 @ sum*sum vshr.s32 d0, d0, #7 vsub.s32 d0, d2, d0 vpadd.s32 d0, d0, d0 vmov r0, r1, d0 vst1.32 {d2}, [r2,:64] bx lr endfunc .macro LOAD_DIFF_8x4 q0 q1 q2 q3 vld1.32 {d1}, [r2], r3 vld1.32 {d0}, [r0,:64], r1 vsubl.u8 \q0, d0, d1 vld1.32 {d3}, [r2], r3 vld1.32 {d2}, [r0,:64], r1 vsubl.u8 \q1, d2, d3 vld1.32 {d5}, [r2], r3 vld1.32 {d4}, [r0,:64], r1 vsubl.u8 \q2, d4, d5 vld1.32 {d7}, [r2], r3 vld1.32 {d6}, [r0,:64], r1 vsubl.u8 \q3, d6, d7 .endm function pixel_satd_4x4_neon vld1.32 {d1[]}, [r2], r3 vld1.32 {d0[]}, [r0,:32], r1 vld1.32 {d3[]}, [r2], r3 vld1.32 {d2[]}, [r0,:32], r1 vld1.32 {d1[1]}, [r2], r3 vld1.32 {d0[1]}, [r0,:32], r1 vld1.32 {d3[1]}, [r2], r3 vld1.32 {d2[1]}, [r0,:32], r1 vsubl.u8 q0, d0, d1 vsubl.u8 q1, d2, d3 SUMSUB_AB q2, q3, q0, q1 SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7 HADAMARD 1, sumsub, q2, q3, q0, q1 HADAMARD 2, amax, q0,, q2, q3 HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr endfunc function pixel_satd_4x8_neon vld1.32 {d1[]}, [r2], r3 vld1.32 {d0[]}, [r0,:32], r1 vld1.32 {d3[]}, [r2], r3 vld1.32 {d2[]}, [r0,:32], r1 vld1.32 {d5[]}, [r2], r3 vld1.32 {d4[]}, [r0,:32], r1 vld1.32 {d7[]}, [r2], r3 vld1.32 {d6[]}, [r0,:32], r1 vld1.32 {d1[1]}, [r2], r3 vld1.32 {d0[1]}, [r0,:32], r1 vsubl.u8 q0, d0, d1 vld1.32 {d3[1]}, [r2], r3 vld1.32 {d2[1]}, [r0,:32], r1 vsubl.u8 q1, d2, d3 vld1.32 {d5[1]}, [r2], r3 vld1.32 {d4[1]}, [r0,:32], r1 vsubl.u8 q2, d4, d5 vld1.32 {d7[1]}, [r2], r3 SUMSUB_AB q8, q9, q0, q1 vld1.32 {d6[1]}, [r0,:32], r1 vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 b satd_4x8_8x4_end_neon endfunc function pixel_satd_8x4_neon vld1.64 {d1}, [r2], r3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q0, d0, d1 vld1.64 {d3}, [r2], r3 vld1.64 {d2}, [r0,:64], r1 vsubl.u8 q1, d2, d3 vld1.64 {d5}, [r2], r3 vld1.64 {d4}, [r0,:64], r1 vsubl.u8 q2, d4, d5 vld1.64 {d7}, [r2], r3 SUMSUB_AB q8, q9, q0, q1 vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q3, d6, d7 SUMSUB_AB q10, q11, q2, q3 endfunc function satd_4x8_8x4_end_neon, export=0 vadd.s16 q0, q8, q10 vadd.s16 q1, q9, q11 vsub.s16 q2, q8, q10 vsub.s16 q3, q9, q11 vtrn.16 q0, q1 vadd.s16 q8, q0, q1 vtrn.16 q2, q3 vsub.s16 q9, q0, q1 vadd.s16 q10, q2, q3 vsub.s16 q11, q2, q3 vtrn.32 q8, q10 vabs.s16 q8, q8 vtrn.32 q9, q11 vabs.s16 q10, q10 vabs.s16 q9, q9 vabs.s16 q11, q11 vmax.u16 q0, q8, q10 vmax.u16 q1, q9, q11 vadd.u16 q0, q0, q1 HORIZ_ADD d0, d0, d1 vmov.32 r0, d0[0] bx lr endfunc function pixel_satd_8x8_neon mov ip, lr bl satd_8x8_neon vadd.u16 q0, q12, q13 vadd.u16 q1, q14, q15 vadd.u16 q0, q0, q1 HORIZ_ADD d0, d0, d1 mov lr, ip vmov.32 r0, d0[0] bx lr endfunc function pixel_satd_8x16_neon vpush {d8-d11} mov ip, lr bl satd_8x8_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 bl satd_8x8_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 vadd.u16 q0, q4, q5 HORIZ_ADD d0, d0, d1 vpop {d8-d11} mov lr, ip vmov.32 r0, d0[0] bx lr endfunc function satd_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q12, d6, d7 vld1.64 {d17}, [r2], r3 SUMSUB_AB q2, q3, q10, q11 vld1.64 {d16}, [r0,:64], r1 vsubl.u8 q13, d16, d17 vld1.64 {d19}, [r2], r3 SUMSUB_AB q8, q10, q0, q2 vld1.64 {d18}, [r0,:64], r1 vsubl.u8 q14, d18, d19 
vld1.64 {d1}, [r2], r3 SUMSUB_AB q9, q11, q1, q3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q15, d0, d1 endfunc // one vertical hadamard pass and two horizontal function satd_8x4v_8x8h_neon, export=0 SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 vtrn.16 q8, q9 SUMSUB_AB q12, q14, q0, q2 vtrn.16 q10, q11 SUMSUB_AB q13, q15, q1, q3 SUMSUB_AB q0, q1, q8, q9 vtrn.16 q12, q13 SUMSUB_AB q2, q3, q10, q11 vtrn.16 q14, q15 SUMSUB_AB q8, q9, q12, q13 vtrn.32 q0, q2 SUMSUB_AB q10, q11, q14, q15 vtrn.32 q1, q3 ABS2 q0, q2 vtrn.32 q8, q10 ABS2 q1, q3 vtrn.32 q9, q11 ABS2 q8, q10 ABS2 q9, q11 vmax.s16 q12, q0, q2 vmax.s16 q13, q1, q3 vmax.s16 q14, q8, q10 vmax.s16 q15, q9, q11 bx lr endfunc function pixel_satd_16x8_neon vpush {d8-d11} mov ip, lr bl satd_16x4_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 vadd.u16 q0, q4, q5 HORIZ_ADD d0, d0, d1 vpop {d8-d11} mov lr, ip vmov.32 r0, d0[0] bx lr endfunc function pixel_satd_16x16_neon vpush {d8-d11} mov ip, lr bl satd_16x4_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 bl satd_16x4_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 vadd.u16 q5, q5, q15 vadd.u16 q0, q4, q5 HORIZ_ADD d0, d0, d1 vpop {d8-d11} mov lr, ip vmov.32 r0, d0[0] bx lr endfunc function satd_16x4_neon, export=0 vld1.64 {d2-d3}, [r2], r3 vld1.64 {d0-d1}, [r0,:128], r1 vsubl.u8 q8, d0, d2 vld1.64 {d6-d7}, [r2], r3 vsubl.u8 q12, d1, d3 vld1.64 {d4-d5}, [r0,:128], r1 vsubl.u8 q9, d4, d6 vld1.64 {d2-d3}, [r2], r3 vsubl.u8 q13, d5, d7 vld1.64 {d0-d1}, [r0,:128], r1 vsubl.u8 q10, d0, d2 vld1.64 {d6-d7}, [r2], r3 vsubl.u8 q14, d1, d3 vadd.s16 q0, q8, q9 vld1.64 {d4-d5}, [r0,:128], r1 vsub.s16 q1, q8, q9 vsubl.u8 q11, d4, d6 vsubl.u8 q15, d5, d7 SUMSUB_AB q2, q3, q10, q11 SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3 b satd_8x4v_8x8h_neon endfunc function pixel_sa8d_8x8_neon mov ip, lr bl sa8d_8x8_neon vadd.u16 q0, q8, q9 HORIZ_ADD d0, d0, d1 mov lr, ip vmov.32 r0, d0[0] add r0, r0, #1 lsr r0, r0, #1 bx lr endfunc function pixel_sa8d_16x16_neon vpush {d8-d11} mov ip, lr bl sa8d_8x8_neon vpaddl.u16 q4, q8 vpaddl.u16 q5, q9 bl sa8d_8x8_neon vpadal.u16 q4, q8 vpadal.u16 q5, q9 sub r0, r0, r1, lsl #4 sub r2, r2, r3, lsl #4 add r0, r0, #8 add r2, r2, #8 bl sa8d_8x8_neon vpadal.u16 q4, q8 vpadal.u16 q5, q9 bl sa8d_8x8_neon vpaddl.u16 q8, q8 vpaddl.u16 q9, q9 vadd.u32 q0, q4, q8 vadd.u32 q1, q5, q9 vadd.u32 q0, q0, q1 vadd.u32 d0, d0, d1 vpadd.u32 d0, d0, d0 vpop {d8-d11} mov lr, ip vmov.32 r0, d0[0] add r0, r0, #1 lsr r0, r0, #1 bx lr endfunc .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 .endm .macro integrated_satd dst, s0, s1, s2, s3 vmov q0, \s0 vmov q1, \s1 vmov q2, \s2 vmov q3, \s3 vtrn.16 q0, q1 vtrn.16 q2, q3 SUMSUB_AB q6, q7, q0, q1 SUMSUB_AB q0, q1, q2, q3 vtrn.32 q6, q0 vtrn.32 q7, q1 vabs.s16 q6, q6 vabs.s16 q0, q0 vabs.s16 q7, q7 vabs.s16 q1, q1 vmax.u16 q6, q6, q0 vmax.u16 q7, q7, q1 vadd.i16 q6, q6, q7 vpadal.u16 \dst, q6 .endm .macro sa8d_satd_8x8 satd= function sa8d_\satd\()8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q12, d6, d7 vld1.64 {d17}, [r2], r3 SUMSUB_AB q2, q3, q10, q11 
vld1.64 {d16}, [r0,:64], r1 vsubl.u8 q13, d16, d17 vld1.64 {d19}, [r2], r3 SUMSUB_AB q8, q10, q0, q2 vld1.64 {d18}, [r0,:64], r1 vsubl.u8 q14, d18, d19 vld1.64 {d1}, [r2], r3 SUMSUB_AB q9, q11, q1, q3 vld1.64 {d0}, [r0,:64], r1 vsubl.u8 q15, d0, d1 HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3 .ifc \satd, satd_ integrated_satd q4, q8, q9, q10, q11 integrated_satd q4, q12, q13, q14, q15 .endif SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13 SUMSUB_AB q2, q10, q10, q14 vtrn.16 q8, q9 SUMSUB_AB q3, q11, q11, q15 vtrn.16 q0, q1 SUMSUB_AB q12, q13, q8, q9 vtrn.16 q10, q11 SUMSUB_AB q8, q9, q0, q1 vtrn.16 q2, q3 SUMSUB_AB q14, q15, q10, q11 vadd.i16 q10, q2, q3 vtrn.32 q12, q14 vsub.i16 q11, q2, q3 vtrn.32 q13, q15 SUMSUB_AB q0, q2, q12, q14 vtrn.32 q8, q10 SUMSUB_AB q1, q3, q13, q15 vtrn.32 q9, q11 SUMSUB_AB q12, q14, q8, q10 SUMSUB_AB q13, q15, q9, q11 vswp d1, d24 ABS2 q0, q12 vswp d3, d26 ABS2 q1, q13 vswp d5, d28 ABS2 q2, q14 vswp d7, d30 ABS2 q3, q15 vmax.s16 q8, q0, q12 vmax.s16 q9, q1, q13 vmax.s16 q10, q2, q14 vmax.s16 q11, q3, q15 vadd.i16 q8, q8, q9 vadd.i16 q9, q10, q11 .ifc \satd, satd_ vpadal.u16 q5, q8 vpadal.u16 q5, q9 .endif bx lr endfunc .endm sa8d_satd_8x8 sa8d_satd_8x8 satd_ function pixel_sa8d_satd_16x16_neon push {lr} vpush {q4-q7} vmov.u32 q4, #0 vmov.u32 q5, #0 bl sa8d_satd_8x8_neon bl sa8d_satd_8x8_neon sub r0, r0, r1, lsl #4 sub r2, r2, r3, lsl #4 add r0, r0, #8 add r2, r2, #8 bl sa8d_satd_8x8_neon bl sa8d_satd_8x8_neon vadd.u32 d1, d10, d11 vadd.u32 d0, d8, d9 vpadd.u32 d1, d1, d1 vpadd.u32 d0, d0, d0 vrshr.u32 d1, d1, #1 vmov.32 r1, d0[0] vmov.32 r0, d1[0] vpop {q4-q7} pop {pc} endfunc .macro HADAMARD_AC w h function pixel_hadamard_ac_\w\()x\h\()_neon vpush {d8-d15} movrel ip, mask_ac4 vmov.i8 q4, #0 // note: this assumes mask_ac8 is after mask_ac4 (so don't move it) vld1.64 {d12-d15}, [ip,:128] vmov.i8 q5, #0 mov ip, lr bl hadamard_ac_8x8_neon .if \h > 8 bl hadamard_ac_8x8_neon .endif .if \w > 8 sub r0, r0, r1, lsl #3 add r0, r0, #8 bl hadamard_ac_8x8_neon .endif .if \w * \h == 256 sub r0, r0, r1, lsl #4 bl hadamard_ac_8x8_neon .endif vadd.s32 d8, d8, d9 vadd.s32 d10, d10, d11 vpadd.s32 d0, d8, d10 vpop {d8-d15} mov lr, ip vmov r0, r1, d0 lsr r0, r0, #1 lsr r1, r1, #2 bx lr endfunc .endm HADAMARD_AC 8, 8 HADAMARD_AC 8, 16 HADAMARD_AC 16, 8 HADAMARD_AC 16, 16 // q4: satd q5: sa8d q6: mask_ac4 q7: mask_ac8 function hadamard_ac_8x8_neon, export=0 vld1.64 {d2}, [r0,:64], r1 vld1.64 {d3}, [r0,:64], r1 vaddl.u8 q0, d2, d3 vld1.64 {d6}, [r0,:64], r1 vsubl.u8 q1, d2, d3 vld1.64 {d7}, [r0,:64], r1 vaddl.u8 q2, d6, d7 vld1.64 {d18}, [r0,:64], r1 vsubl.u8 q3, d6, d7 vld1.64 {d19}, [r0,:64], r1 vaddl.u8 q8, d18, d19 vld1.64 {d22}, [r0,:64], r1 vsubl.u8 q9, d18, d19 vld1.64 {d23}, [r0,:64], r1 SUMSUB_ABCD q12, q14, q13, q15, q0, q2, q1, q3 vaddl.u8 q10, d22, d23 vsubl.u8 q11, d22, d23 vtrn.16 q12, q13 SUMSUB_ABCD q0, q2, q1, q3, q8, q10, q9, q11 vtrn.16 q14, q15 SUMSUB_AB q8, q9, q12, q13 vtrn.16 q0, q1 SUMSUB_AB q10, q11, q14, q15 vtrn.16 q2, q3 SUMSUB_AB q12, q13, q0, q1 vtrn.32 q8, q10 SUMSUB_AB q14, q15, q2, q3 vtrn.32 q9, q11 SUMSUB_AB q0, q2, q8, q10 vtrn.32 q12, q14 SUMSUB_AB q1, q3, q9, q11 vtrn.32 q13, q15 SUMSUB_ABCD q8, q10, q9, q11, q12, q14, q13, q15 vabs.s16 q12, q0 vabs.s16 q13, q8 vabs.s16 q15, q1 vadd.s16 q12, q12, q13 vabs.s16 q14, q2 vand.s16 q12, q12, q6 vabs.s16 q13, q3 vadd.s16 q12, q12, q15 vabs.s16 q15, q9 vadd.s16 q12, q12, q14 vabs.s16 q14, q10 vadd.s16 q12, q12, q13 vabs.s16 q13, q11 vadd.s16 q12, q12, q15 vsub.s16 q15, q11, q3 vadd.s16 q12, q12, q14 
vadd.s16 q14, q11, q3 vadd.s16 q12, q12, q13 vsub.s16 q13, q10, q2 vadd.s16 q2, q10, q2 vpadal.u16 q4, q12 SUMSUB_AB q10, q11, q9, q1 SUMSUB_AB q9, q8, q0, q8 vswp d29, d30 vabs.s16 q14, q14 vabs.s16 q15, q15 vswp d5, d26 vabs.s16 q2, q2 vabs.s16 q13, q13 vswp d21, d22 vabs.s16 q10, q10 vabs.s16 q11, q11 vmax.s16 q3, q14, q15 vmax.s16 q2, q2, q13 vmax.s16 q1, q10, q11 vswp d19, d16 SUMSUB_AB q14, q15, q9, q8 vadd.s16 q2, q2, q3 vadd.s16 q2, q2, q1 vand q14, q14, q7 vadd.s16 q2, q2, q2 vabs.s16 q15, q15 vabs.s16 q14, q14 vadd.s16 q2, q2, q15 vadd.s16 q2, q2, q14 vpadal.u16 q5, q2 bx lr endfunc .macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext vld1.64 {\db}, [r2], r3 vmull.u8 \ssa, \da, \da vmull.u8 \s12, \da, \db .if \n == 1 vpaddl.u16 q2, \lastssa vpaddl.u16 q3, \lasts12 vaddl.u8 q0, d0, \da .else vpadal.u16 q2, \lastssa vpadal.u16 q3, \lasts12 vaddw.u8 q0, q0, \da .endif vpadal.u16 q2, \lastssb .if \n < 3 vld1.64 {\dnext}, [r0], r1 .endif .if \n == 1 vaddl.u8 q1, d2, \db .else vaddw.u8 q1, q1, \db .endif vmull.u8 \ssb, \db, \db .endm function pixel_ssim_4x4x2_core_neon ldr ip, [sp] vld1.64 {d0}, [r0], r1 vld1.64 {d2}, [r2], r3 vmull.u8 q2, d0, d0 vmull.u8 q3, d0, d2 vld1.64 {d28}, [r0], r1 vmull.u8 q15, d2, d2 SSIM_ITER 1, q8, q9, q14, q2, q3, q15, d28, d29, d26 SSIM_ITER 2, q10,q11,q13, q8, q9, q14, d26, d27, d28 SSIM_ITER 3, q8, q9, q15, q10,q11,q13, d28, d29 vpadal.u16 q2, q8 vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 vpadal.u16 q2, q15 vpadal.u16 q3, q9 vpadd.u32 d0, d0, d1 vpadd.u32 d1, d2, d3 vpadd.u32 d2, d4, d5 vpadd.u32 d3, d6, d7 vst4.32 {d0-d3}, [ip] bx lr endfunc // FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2 function pixel_ssim_end4_neon vld1.32 {d16-d19}, [r0,:128]! vld1.32 {d20-d23}, [r1,:128]! vadd.s32 q0, q8, q10 vadd.s32 q1, q9, q11 vld1.32 {d24-d27}, [r0,:128]! vadd.s32 q0, q0, q1 vld1.32 {d28-d31}, [r1,:128]! 
vadd.s32 q2, q12, q14 vadd.s32 q3, q13, q15 vld1.32 {d16-d17}, [r0,:128] vadd.s32 q1, q1, q2 vld1.32 {d18-d19}, [r1,:128] vadd.s32 q8, q8, q9 vadd.s32 q2, q2, q3 vadd.s32 q3, q3, q8 vtrn.32 q0, q1 vtrn.32 q2, q3 vswp d1, d4 vswp d3, d6 // s1=q0, s2=q1, ss=q2, s12=q3 vmul.s32 q8, q0, q1 // s1*s2 vmul.s32 q0, q0, q0 vmla.s32 q0, q1, q1 // s1*s1 + s2*s2 vshl.s32 q3, q3, #7 vshl.s32 q2, q2, #6 vadd.s32 q1, q8, q8 mov r3, #416 // ssim_c1 = .01*.01*255*255*64 movconst ip, 235963 // ssim_c2 = .03*.03*255*255*64*63 vdup.32 q14, r3 vdup.32 q15, ip vsub.s32 q2, q2, q0 // vars vsub.s32 q3, q3, q1 // covar*2 vadd.s32 q0, q0, q14 vadd.s32 q2, q2, q15 vadd.s32 q1, q1, q14 vadd.s32 q3, q3, q15 vcvt.f32.s32 q0, q0 vcvt.f32.s32 q2, q2 vcvt.f32.s32 q1, q1 vcvt.f32.s32 q3, q3 vmul.f32 q0, q0, q2 vmul.f32 q1, q1, q3 cmp r2, #4 vdiv.f32 s0, s4, s0 vdiv.f32 s1, s5, s1 vdiv.f32 s2, s6, s2 vdiv.f32 s3, s7, s3 beq ssim_skip movrel r3, mask_ff sub r3, r3, r2, lsl #2 vld1.64 {d6-d7}, [r3] vand q0, q0, q3 ssim_skip: vadd.f32 d0, d0, d1 vpadd.f32 d0, d0, d0 vmov.32 r0, d0[0] bx lr endfunc x264-master/common/arm/pixel.h000066400000000000000000000222761502133446700164310ustar00rootroot00000000000000/***************************************************************************** * pixel.h: arm pixel metrics ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_ARM_PIXEL_H #define X264_ARM_PIXEL_H #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon) #define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon) #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon) #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon) #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon) #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon) #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon) #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon) #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon) #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon) #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon) #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon) #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon) #define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon) #define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon) #define x264_pixel_sad_4x4_armv6 x264_template(pixel_sad_4x4_armv6) #define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon) #define x264_pixel_sad_4x8_armv6 x264_template(pixel_sad_4x8_armv6) #define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon) #define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon) #define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon) #define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon) #define x264_pixel_sad_aligned_16x16_neon x264_template(pixel_sad_aligned_16x16_neon) #define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual) #define x264_pixel_sad_aligned_16x8_neon x264_template(pixel_sad_aligned_16x8_neon) #define x264_pixel_sad_aligned_16x8_neon_dual x264_template(pixel_sad_aligned_16x8_neon_dual) #define x264_pixel_sad_aligned_4x4_neon x264_template(pixel_sad_aligned_4x4_neon) #define x264_pixel_sad_aligned_4x8_neon x264_template(pixel_sad_aligned_4x8_neon) #define x264_pixel_sad_aligned_8x16_neon x264_template(pixel_sad_aligned_8x16_neon) #define x264_pixel_sad_aligned_8x16_neon_dual x264_template(pixel_sad_aligned_8x16_neon_dual) #define x264_pixel_sad_aligned_8x4_neon x264_template(pixel_sad_aligned_8x4_neon) #define x264_pixel_sad_aligned_8x4_neon_dual x264_template(pixel_sad_aligned_8x4_neon_dual) #define x264_pixel_sad_aligned_8x8_neon x264_template(pixel_sad_aligned_8x8_neon) #define x264_pixel_sad_aligned_8x8_neon_dual x264_template(pixel_sad_aligned_8x8_neon_dual) #define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon) #define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon) #define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon) #define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon) #define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon) #define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon) #define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon) #define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon) #define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon) #define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon) #define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon) #define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon) #define 
x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon) #define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon) #define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon) #define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon) #define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon) #define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon) #define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon) #define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon) #define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon) #define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon) #define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon) #define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon) #define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon) #define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon) #define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon) #define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon) #define DECL_PIXELS( ret, name, suffix, args ) \ ret x264_pixel_##name##_16x16_##suffix args;\ ret x264_pixel_##name##_16x8_##suffix args;\ ret x264_pixel_##name##_8x16_##suffix args;\ ret x264_pixel_##name##_8x8_##suffix args;\ ret x264_pixel_##name##_8x4_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ ret x264_pixel_##name##_4x4_##suffix args;\ #define DECL_X1( name, suffix ) \ DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) ) #define DECL_X4( name, suffix ) \ DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\ DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) ) int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); DECL_X1( sad, neon ) DECL_X1( sad_aligned, neon ) DECL_X1( sad_aligned, neon_dual ) DECL_X4( sad, neon ) DECL_X1( satd, neon ) DECL_X1( ssd, neon ) #define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon) void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); #define x264_pixel_vsad_neon x264_template(pixel_vsad_neon) int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); #define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon) int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); #define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon) int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); #define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon) uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); #define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon) uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); #define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon) uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); #define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon) uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); #define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon) int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); #define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon) int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * ); #define x264_pixel_hadamard_ac_8x8_neon 
x264_template(pixel_hadamard_ac_8x8_neon) uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); #define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon) uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); #define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon) uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t ); #define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon) uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t ); #define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon) void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, const uint8_t *, intptr_t, int sums[2][4] ); #define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon) float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); #define x264_pixel_asd8_neon x264_template(pixel_asd8_neon) int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #endif x264-master/common/arm/predict-a.S000066400000000000000000000531411502133446700171260ustar00rootroot00000000000000/***************************************************************************** * predict.S: arm intra prediction ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Mans Rullgard * Martin Storsjo * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "asm.S" const p16weight, align=4 .short 1,2,3,4,5,6,7,8 endconst .text .macro ldcol.8 rd, rs, rt, n=8, hi=0 .if \n == 8 || \hi == 0 vld1.8 {\rd[0]}, [\rs], \rt vld1.8 {\rd[1]}, [\rs], \rt vld1.8 {\rd[2]}, [\rs], \rt vld1.8 {\rd[3]}, [\rs], \rt .endif .if \n == 8 || \hi == 1 vld1.8 {\rd[4]}, [\rs], \rt vld1.8 {\rd[5]}, [\rs], \rt vld1.8 {\rd[6]}, [\rs], \rt vld1.8 {\rd[7]}, [\rs], \rt .endif .endm .macro ldcol.16 rd1, rd2, rs, rt, ru add \ru, \rs, \rt, lsl #3 vld1.8 {\rd1[0]}, [\rs], \rt vld1.8 {\rd2[0]}, [\ru], \rt vld1.8 {\rd1[1]}, [\rs], \rt vld1.8 {\rd2[1]}, [\ru], \rt vld1.8 {\rd1[2]}, [\rs], \rt vld1.8 {\rd2[2]}, [\ru], \rt vld1.8 {\rd1[3]}, [\rs], \rt vld1.8 {\rd2[3]}, [\ru], \rt vld1.8 {\rd1[4]}, [\rs], \rt vld1.8 {\rd2[4]}, [\ru], \rt vld1.8 {\rd1[5]}, [\rs], \rt vld1.8 {\rd2[5]}, [\ru], \rt vld1.8 {\rd1[6]}, [\rs], \rt vld1.8 {\rd2[6]}, [\ru], \rt vld1.8 {\rd1[7]}, [\rs], \rt vld1.8 {\rd2[7]}, [\ru], \rt .endm .macro add16x8 dq, dl, dh, rl, rh vaddl.u8 \dq, \rl, \rh vadd.u16 \dl, \dl, \dh vpadd.u16 \dl, \dl, \dl vpadd.u16 \dl, \dl, \dl .endm // because gcc doesn't believe in using the free shift in add function predict_4x4_h_armv6 ldrb r1, [r0, #0*FDEC_STRIDE-1] ldrb r2, [r0, #1*FDEC_STRIDE-1] ldrb r3, [r0, #2*FDEC_STRIDE-1] ldrb ip, [r0, #3*FDEC_STRIDE-1] add r1, r1, r1, lsl #8 add r2, r2, r2, lsl #8 add r3, r3, r3, lsl #8 add ip, ip, ip, lsl #8 add r1, r1, r1, lsl #16 str r1, [r0, #0*FDEC_STRIDE] add r2, r2, r2, lsl #16 str r2, [r0, #1*FDEC_STRIDE] add r3, r3, r3, lsl #16 str r3, [r0, #2*FDEC_STRIDE] add ip, ip, ip, lsl #16 str ip, [r0, #3*FDEC_STRIDE] bx lr endfunc function predict_4x4_v_armv6 ldr r1, [r0, #0 - 1 * FDEC_STRIDE] str r1, [r0, #0 + 0 * FDEC_STRIDE] str r1, [r0, #0 + 1 * FDEC_STRIDE] str r1, [r0, #0 + 2 * FDEC_STRIDE] str r1, [r0, #0 + 3 * FDEC_STRIDE] bx lr endfunc function predict_4x4_dc_armv6 mov ip, #0 ldr r1, [r0, #-FDEC_STRIDE] ldrb r2, [r0, #0*FDEC_STRIDE-1] ldrb r3, [r0, #1*FDEC_STRIDE-1] usad8 r1, r1, ip add r2, r2, #4 ldrb ip, [r0, #2*FDEC_STRIDE-1] add r2, r2, r3 ldrb r3, [r0, #3*FDEC_STRIDE-1] add r2, r2, ip add r2, r2, r3 add r1, r1, r2 lsr r1, r1, #3 add r1, r1, r1, lsl #8 add r1, r1, r1, lsl #16 str r1, [r0, #0*FDEC_STRIDE] str r1, [r0, #1*FDEC_STRIDE] str r1, [r0, #2*FDEC_STRIDE] str r1, [r0, #3*FDEC_STRIDE] bx lr endfunc function predict_4x4_dc_top_neon mov r12, #FDEC_STRIDE sub r1, r0, #FDEC_STRIDE vld1.32 d1[], [r1,:32] vpaddl.u8 d1, d1 vpadd.u16 d1, d1, d1 vrshr.u16 d1, d1, #2 vdup.8 d1, d1[0] vst1.32 d1[0], [r0,:32], r12 vst1.32 d1[0], [r0,:32], r12 vst1.32 d1[0], [r0,:32], r12 vst1.32 d1[0], [r0,:32], r12 bx lr endfunc // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1 uhadd8 \a1, \a1, \c1 uhadd8 \a2, \a2, \c2 uhadd8 \c1, \a1, \b1 uhadd8 \c2, \a2, \b2 eor \a1, \a1, \b1 eor \a2, \a2, \b2 and \a1, \a1, \pb_1 and \a2, \a2, \pb_1 uadd8 \a1, \a1, \c1 uadd8 \a2, \a2, \c2 .endm function predict_4x4_ddr_armv6 ldr r1, [r0, # -FDEC_STRIDE] ldrb r2, [r0, # -FDEC_STRIDE-1] ldrb r3, [r0, #0*FDEC_STRIDE-1] push {r4-r6,lr} add r2, r2, r1, lsl #8 ldrb r4, [r0, #1*FDEC_STRIDE-1] add r3, r3, r2, lsl #8 ldrb r5, [r0, #2*FDEC_STRIDE-1] ldrb r6, [r0, #3*FDEC_STRIDE-1] add r4, r4, r3, lsl #8 add r5, r5, r4, lsl #8 add r6, r6, r5, lsl #8 ldr ip, =0x01010101 PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip str r1, [r0, #0*FDEC_STRIDE] lsl r2, r1, #8 lsl r3, r1, #16 lsl r4, r4, #8 lsl r5, r1, #24 add r2, r2, r4, lsr #24 str r2, [r0, 
#1*FDEC_STRIDE] add r3, r3, r4, lsr #16 str r3, [r0, #2*FDEC_STRIDE] add r5, r5, r4, lsr #8 str r5, [r0, #3*FDEC_STRIDE] pop {r4-r6,pc} endfunc function predict_4x4_ddl_neon sub r0, #FDEC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d0}, [r0], ip vdup.8 d3, d0[7] vext.8 d1, d0, d0, #1 vext.8 d2, d0, d3, #2 vhadd.u8 d0, d0, d2 vrhadd.u8 d0, d0, d1 vst1.32 {d0[0]}, [r0,:32], ip vext.8 d1, d0, d0, #1 vext.8 d2, d0, d0, #2 vst1.32 {d1[0]}, [r0,:32], ip vext.8 d3, d0, d0, #3 vst1.32 {d2[0]}, [r0,:32], ip vst1.32 {d3[0]}, [r0,:32], ip bx lr endfunc function predict_8x8_dc_neon mov ip, #0 ldrd r2, r3, [r1, #8] push {r4-r5,lr} ldrd r4, r5, [r1, #16] lsl r3, r3, #8 ldrb lr, [r1, #7] usad8 r2, r2, ip usad8 r3, r3, ip usada8 r2, r4, ip, r2 add lr, lr, #8 usada8 r3, r5, ip, r3 add r2, r2, lr mov ip, #FDEC_STRIDE add r2, r2, r3 lsr r2, r2, #4 vdup.8 d0, r2 .rept 8 vst1.64 {d0}, [r0,:64], ip .endr pop {r4-r5,pc} endfunc function predict_8x8_h_neon add r1, r1, #7 mov ip, #FDEC_STRIDE vld1.64 {d16}, [r1] vdup.8 d0, d16[7] vdup.8 d1, d16[6] vst1.64 {d0}, [r0,:64], ip vdup.8 d2, d16[5] vst1.64 {d1}, [r0,:64], ip vdup.8 d3, d16[4] vst1.64 {d2}, [r0,:64], ip vdup.8 d4, d16[3] vst1.64 {d3}, [r0,:64], ip vdup.8 d5, d16[2] vst1.64 {d4}, [r0,:64], ip vdup.8 d6, d16[1] vst1.64 {d5}, [r0,:64], ip vdup.8 d7, d16[0] vst1.64 {d6}, [r0,:64], ip vst1.64 {d7}, [r0,:64], ip bx lr endfunc function predict_8x8_v_neon add r1, r1, #16 mov r12, #FDEC_STRIDE vld1.8 {d0}, [r1,:64] .rept 8 vst1.8 {d0}, [r0,:64], r12 .endr bx lr endfunc function predict_8x8_ddl_neon add r1, #16 vld1.8 {d0, d1}, [r1,:128] vmov.i8 q3, #0 vrev64.8 d2, d1 vext.8 q8, q3, q0, #15 vext.8 q2, q0, q1, #1 vhadd.u8 q8, q2 mov r12, #FDEC_STRIDE vrhadd.u8 q0, q8 vext.8 d2, d0, d1, #1 vext.8 d3, d0, d1, #2 vst1.8 d2, [r0,:64], r12 vext.8 d2, d0, d1, #3 vst1.8 d3, [r0,:64], r12 vext.8 d3, d0, d1, #4 vst1.8 d2, [r0,:64], r12 vext.8 d2, d0, d1, #5 vst1.8 d3, [r0,:64], r12 vext.8 d3, d0, d1, #6 vst1.8 d2, [r0,:64], r12 vext.8 d2, d0, d1, #7 vst1.8 d3, [r0,:64], r12 vst1.8 d2, [r0,:64], r12 vst1.8 d1, [r0,:64], r12 bx lr endfunc function predict_8x8_ddr_neon vld1.8 {d0-d3}, [r1,:128] vext.8 q2, q0, q1, #7 vext.8 q3, q0, q1, #9 vhadd.u8 q2, q2, q3 vrhadd.u8 d0, d1, d4 vrhadd.u8 d1, d2, d5 add r0, #7*FDEC_STRIDE mov r12, #-1*FDEC_STRIDE vext.8 d2, d0, d1, #1 vst1.8 {d0}, [r0,:64], r12 vext.8 d4, d0, d1, #2 vst1.8 {d2}, [r0,:64], r12 vext.8 d5, d0, d1, #3 vst1.8 {d4}, [r0,:64], r12 vext.8 d4, d0, d1, #4 vst1.8 {d5}, [r0,:64], r12 vext.8 d5, d0, d1, #5 vst1.8 {d4}, [r0,:64], r12 vext.8 d4, d0, d1, #6 vst1.8 {d5}, [r0,:64], r12 vext.8 d5, d0, d1, #7 vst1.8 {d4}, [r0,:64], r12 vst1.8 {d5}, [r0,:64], r12 bx lr endfunc function predict_8x8_vl_neon add r1, #16 mov r12, #FDEC_STRIDE vld1.8 {d0, d1}, [r1,:128] vext.8 q1, q1, q0, #15 vext.8 q2, q0, q2, #1 vrhadd.u8 q3, q0, q2 vhadd.u8 q1, q1, q2 vrhadd.u8 q0, q0, q1 vext.8 d2, d0, d1, #1 vst1.8 {d6}, [r0,:64], r12 vext.8 d3, d6, d7, #1 vst1.8 {d2}, [r0,:64], r12 vext.8 d2, d0, d1, #2 vst1.8 {d3}, [r0,:64], r12 vext.8 d3, d6, d7, #2 vst1.8 {d2}, [r0,:64], r12 vext.8 d2, d0, d1, #3 vst1.8 {d3}, [r0,:64], r12 vext.8 d3, d6, d7, #3 vst1.8 {d2}, [r0,:64], r12 vext.8 d2, d0, d1, #4 vst1.8 {d3}, [r0,:64], r12 vst1.8 {d2}, [r0,:64], r12 bx lr endfunc function predict_8x8_vr_neon add r1, #8 mov r12, #FDEC_STRIDE vld1.8 {d4,d5}, [r1,:64] vext.8 q1, q2, q2, #14 vext.8 q0, q2, q2, #15 vhadd.u8 q3, q2, q1 vrhadd.u8 q2, q2, q0 vrhadd.u8 q0, q0, q3 vmov d2, d0 vst1.8 {d5}, [r0,:64], r12 vuzp.8 d2, d0 vst1.8 {d1}, [r0,:64], r12 vext.8 d6, d0, d5, #7 
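    // Remaining rows of the vertical-right prediction: each later pair of output
    // rows is the two already-stored filtered rows (d5, d1) shifted right by one
    // more pixel via vext, with left-edge predictors from d0/d2 shifted in.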
vext.8 d3, d2, d1, #7 vst1.8 {d6}, [r0,:64], r12 vst1.8 {d3}, [r0,:64], r12 vext.8 d6, d0, d5, #6 vext.8 d3, d2, d1, #6 vst1.8 {d6}, [r0,:64], r12 vst1.8 {d3}, [r0,:64], r12 vext.8 d6, d0, d5, #5 vext.8 d3, d2, d1, #5 vst1.8 {d6}, [r0,:64], r12 vst1.8 {d3}, [r0,:64], r12 bx lr endfunc function predict_8x8_hd_neon mov r12, #FDEC_STRIDE add r1, #7 vld1.8 {d2,d3}, [r1] vext.8 q3, q1, q1, #1 vext.8 q2, q1, q1, #2 vrhadd.u8 q8, q1, q3 vhadd.u8 q1, q2 vrhadd.u8 q0, q1, q3 vzip.8 d16, d0 vext.8 d2, d0, d1, #6 vext.8 d3, d0, d1, #4 vst1.8 {d2}, [r0,:64], r12 vext.8 d2, d0, d1, #2 vst1.8 {d3}, [r0,:64], r12 vst1.8 {d2}, [r0,:64], r12 vext.8 d2, d16, d0, #6 vst1.8 {d0}, [r0,:64], r12 vext.8 d3, d16, d0, #4 vst1.8 {d2}, [r0,:64], r12 vext.8 d2, d16, d0, #2 vst1.8 {d3}, [r0,:64], r12 vst1.8 {d2}, [r0,:64], r12 vst1.8 {d16}, [r0,:64], r12 bx lr endfunc function predict_8x8_hu_neon mov r12, #FDEC_STRIDE add r1, #7 vld1.8 {d7}, [r1] vdup.8 d6, d7[0] vrev64.8 d7, d7 vext.8 d4, d7, d6, #2 vext.8 d2, d7, d6, #1 vhadd.u8 d16, d7, d4 vrhadd.u8 d0, d2, d7 vrhadd.u8 d1, d16, d2 vzip.8 d0, d1 vdup.16 q1, d1[3] vext.8 q2, q0, q1, #2 vext.8 q3, q0, q1, #4 vext.8 q8, q0, q1, #6 vst1.8 {d0}, [r0,:64], r12 vst1.8 {d4}, [r0,:64], r12 vst1.8 {d6}, [r0,:64], r12 vst1.8 {d16}, [r0,:64], r12 vst1.8 {d1}, [r0,:64], r12 vst1.8 {d5}, [r0,:64], r12 vst1.8 {d7}, [r0,:64], r12 vst1.8 {d17}, [r0,:64] bx lr endfunc function predict_8x8c_dc_top_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {d0}, [r2,:64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 d1, d0[1] vdup.8 d0, d0[0] vtrn.32 d0, d1 b pred8x8_dc_end endfunc function predict_8x8c_dc_left_neon mov r1, #FDEC_STRIDE sub r2, r0, #1 ldcol.8 d0, r2, r1 vpaddl.u8 d0, d0 vpadd.u16 d0, d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 d1, d0[1] vdup.8 d0, d0[0] b pred8x8_dc_end endfunc function predict_8x8c_dc_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {d0}, [r2,:64] sub r2, r0, #1 ldcol.8 d1, r2, r1 vtrn.32 d0, d1 vpaddl.u8 q0, q0 vpadd.u16 d0, d0, d1 vpadd.u16 d1, d0, d0 vrshrn.u16 d2, q0, #3 vrshrn.u16 d3, q0, #2 vdup.8 d0, d2[4] vdup.8 d1, d3[3] vdup.8 d4, d3[2] vdup.8 d5, d2[5] vtrn.32 q0, q2 pred8x8_dc_end: add r2, r0, r1, lsl #2 .rept 4 vst1.8 {d0}, [r0,:64], r1 vst1.8 {d1}, [r2,:64], r1 .endr bx lr endfunc function predict_8x8c_h_neon sub r1, r0, #1 mov ip, #FDEC_STRIDE .rept 4 vld1.8 {d0[]}, [r1], ip vld1.8 {d2[]}, [r1], ip vst1.64 {d0}, [r0,:64], ip vst1.64 {d2}, [r0,:64], ip .endr bx lr endfunc function predict_8x8c_v_neon sub r0, r0, #FDEC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d0}, [r0,:64], ip .rept 8 vst1.64 {d0}, [r0,:64], ip .endr bx lr endfunc function predict_8x8c_p_neon sub r3, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE add r2, r3, #4 sub r3, r3, #1 vld1.32 {d0[0]}, [r3] vld1.32 {d2[0]}, [r2,:32], r1 ldcol.8 d0, r3, r1, 4, hi=1 add r3, r3, r1 ldcol.8 d3, r3, r1, 4 vaddl.u8 q8, d2, d3 vrev32.8 d0, d0 vtrn.32 d2, d3 vsubl.u8 q2, d2, d0 movrel r3, p16weight vld1.16 {q0}, [r3,:128] vmul.s16 d4, d4, d0 vmul.s16 d5, d5, d0 vpadd.i16 d4, d4, d5 vpaddl.s16 d4, d4 vshl.i32 d5, d4, #4 vadd.s32 d4, d4, d5 vrshrn.s32 d4, q2, #5 mov r3, #0 vtrn.16 d4, d5 vadd.i16 d2, d4, d5 vshl.i16 d3, d2, #2 vrev64.16 d16, d16 vsub.i16 d3, d3, d2 vadd.i16 d16, d16, d0 vshl.i16 d2, d16, #4 vsub.i16 d2, d2, d3 vext.16 q0, q0, q0, #7 vmov.16 d0[0], r3 vmul.i16 q0, q0, d4[0] vdup.16 q1, d2[0] vdup.16 q3, d5[0] vadd.i16 q1, q1, q0 mov r3, #8 1: vqshrun.s16 d0, q1, #5 vadd.i16 q1, q1, q3 vst1.8 {d0}, [r0,:64], r1 subs r3, r3, #1 bne 1b bx lr endfunc function 
predict_8x16c_dc_top_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {d0}, [r2,:64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 d1, d0[1] vdup.8 d0, d0[0] vtrn.32 d0, d1 add r2, r0, r1, lsl #2 .rept 4 vst1.8 {d0}, [r0,:64], r1 vst1.8 {d1}, [r2,:64], r1 .endr add r2, r2, r1, lsl #2 add r0, r0, r1, lsl #2 .rept 4 vst1.8 {d0}, [r0,:64], r1 vst1.8 {d1}, [r2,:64], r1 .endr bx lr endfunc function predict_8x16c_h_neon sub r1, r0, #1 mov ip, #FDEC_STRIDE .rept 8 vld1.8 {d0[]}, [r1], ip vld1.8 {d2[]}, [r1], ip vst1.64 {d0}, [r0,:64], ip vst1.64 {d2}, [r0,:64], ip .endr bx lr endfunc function predict_8x16c_p_neon sub r3, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE add r2, r3, #4 sub r3, r3, #1 vld1.32 {d0[0]}, [r3] vld1.32 {d2[0]}, [r2,:32], r1 ldcol.8 d1, r3, r1 add r3, r3, r1 ldcol.8 d3, r3, r1 vrev64.32 d16, d3 vaddl.u8 q8, d2, d16 vrev32.8 d0, d0 vsubl.u8 q2, d2, d0 vrev64.8 d1, d1 vsubl.u8 q3, d3, d1 movrel r3, p16weight vld1.16 {q0}, [r3,:128] vmul.s16 d4, d4, d0 vmul.s16 q3, q3, q0 vpadd.i16 d4, d4, d5 vpadd.i16 d6, d6, d7 vpaddl.s16 d4, d4 @ d4[0] = H vpaddl.s16 d6, d6 vpadd.s32 d6, d6 @ d6[0] = V vshl.i32 d5, d4, #4 vadd.s32 d4, d4, d5 @ d4[0] = 17*H vshl.i32 d7, d6, #2 vrshrn.s32 d4, q2, #5 @ d4[0] = b vadd.s32 d6, d6, d7 @ d6[0] = 5*V vrshrn.s32 d6, q3, #6 @ d6[0] = c mov r3, #0 vshl.i16 d3, d4, #2 vsub.i16 d3, d3, d4 @ d2[0] = 3 * b vshl.i16 d2, d6, #3 vadd.i16 d3, d3, d2 @ d2[0] = 3 * b + 8 * c vsub.i16 d3, d3, d6 @ d2[0] = 3 * b + 7 * c vrev64.16 d16, d16 vadd.i16 d16, d16, d0 @ d16[0] = src[]+src[] + 1 vshl.i16 d2, d16, #4 @ d3[0] = a + 16 vsub.i16 d2, d2, d3 @ i00 vext.16 q0, q0, q0, #7 vmov.16 d0[0], r3 vmul.i16 q0, q0, d4[0] vdup.16 q1, d2[0] vdup.16 q3, d6[0] vadd.i16 q1, q1, q0 mov r3, #16 1: vqshrun.s16 d0, q1, #5 vadd.i16 q1, q1, q3 vst1.8 {d0}, [r0,:64], r1 subs r3, r3, #1 bne 1b bx lr endfunc function predict_16x16_dc_top_neon sub r2, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE vld1.8 {q0}, [r2,:128] add16x8 q0, d0, d1, d0, d1 vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] b pred16x16_dc_end endfunc function predict_16x16_dc_left_neon mov r1, #FDEC_STRIDE sub r2, r0, #1 ldcol.8 d0, r2, r1 ldcol.8 d1, r2, r1 add16x8 q0, d0, d1, d0, d1 vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] b pred16x16_dc_end endfunc function predict_16x16_dc_neon sub r3, r0, #FDEC_STRIDE sub r0, r0, #1 vld1.64 {d0-d1}, [r3,:128] ldrb ip, [r0], #FDEC_STRIDE vaddl.u8 q0, d0, d1 ldrb r1, [r0], #FDEC_STRIDE vadd.u16 d0, d0, d1 vpadd.u16 d0, d0, d0 vpadd.u16 d0, d0, d0 .rept 4 ldrb r2, [r0], #FDEC_STRIDE add ip, ip, r1 ldrb r3, [r0], #FDEC_STRIDE add ip, ip, r2 ldrb r1, [r0], #FDEC_STRIDE add ip, ip, r3 .endr ldrb r2, [r0], #FDEC_STRIDE add ip, ip, r1 ldrb r3, [r0], #FDEC_STRIDE add ip, ip, r2 sub r0, r0, #FDEC_STRIDE*16 add ip, ip, r3 vdup.16 d1, ip vadd.u16 d0, d0, d1 mov r1, #FDEC_STRIDE add r0, r0, #1 vrshr.u16 d0, d0, #5 vdup.8 q0, d0[0] pred16x16_dc_end: .rept 16 vst1.64 {d0-d1}, [r0,:128], r1 .endr bx lr endfunc function predict_16x16_h_neon sub r1, r0, #1 mov ip, #FDEC_STRIDE .rept 8 vld1.8 {d0[]}, [r1], ip vmov d1, d0 vld1.8 {d2[]}, [r1], ip vmov d3, d2 vst1.64 {d0-d1}, [r0,:128], ip vst1.64 {d2-d3}, [r0,:128], ip .endr bx lr endfunc function predict_16x16_v_neon sub r0, r0, #FDEC_STRIDE mov ip, #FDEC_STRIDE vld1.64 {d0-d1}, [r0,:128], ip .rept 16 vst1.64 {d0-d1}, [r0,:128], ip .endr bx lr endfunc function predict_16x16_p_neon sub r3, r0, #FDEC_STRIDE mov r1, #FDEC_STRIDE add r2, r3, #8 sub r3, r3, #1 vld1.8 {d0}, [r3] vld1.8 {d2}, [r2,:64], r1 ldcol.8 d1, r3, r1 add r3, r3, r1 ldcol.8 d3, r3, 
r1 vrev64.8 q0, q0 vaddl.u8 q8, d2, d3 vsubl.u8 q2, d2, d0 vsubl.u8 q3, d3, d1 movrel r3, p16weight vld1.8 {q0}, [r3,:128] vmul.s16 q2, q2, q0 vmul.s16 q3, q3, q0 vadd.i16 d4, d4, d5 vadd.i16 d5, d6, d7 vpadd.i16 d4, d4, d5 vpadd.i16 d4, d4, d4 vshll.s16 q3, d4, #2 vaddw.s16 q2, q3, d4 vrshrn.s32 d4, q2, #6 mov r3, #0 vtrn.16 d4, d5 vadd.i16 d2, d4, d5 vshl.i16 d3, d2, #3 vrev64.16 d16, d17 vsub.i16 d3, d3, d2 vadd.i16 d16, d16, d0 vshl.i16 d2, d16, #4 vsub.i16 d2, d2, d3 vshl.i16 d3, d4, #4 vext.16 q0, q0, q0, #7 vsub.i16 d6, d5, d3 vmov.16 d0[0], r3 vmul.i16 q0, q0, d4[0] vdup.16 q1, d2[0] vdup.16 q2, d4[0] vdup.16 q3, d6[0] vshl.i16 q2, q2, #3 vadd.i16 q1, q1, q0 vadd.i16 q3, q3, q2 mov r3, #16 1: vqshrun.s16 d0, q1, #5 vadd.i16 q1, q1, q2 vqshrun.s16 d1, q1, #5 vadd.i16 q1, q1, q3 vst1.8 {q0}, [r0,:128], r1 subs r3, r3, #1 bne 1b bx lr endfunc x264-master/common/arm/predict-c.c000066400000000000000000000075751502133446700171620ustar00rootroot00000000000000/***************************************************************************** * predict.c: arm intra prediction ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "predict.h" #include "pixel.h" void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] ) { if( !(cpu&X264_CPU_ARMV6) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6; pf[I_PRED_4x4_V] = x264_predict_4x4_v_armv6; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6; if( !(cpu&X264_CPU_NEON) ) return; pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon; pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon; pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. 
*/ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_neon; pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon; pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon; #endif // !HIGH_BIT_DEPTH } void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_NEON) ) return; #if !HIGH_BIT_DEPTH pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon; pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon; pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon; pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon; pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon; #endif // !HIGH_BIT_DEPTH } x264-master/common/arm/predict.h000066400000000000000000000132451502133446700167360ustar00rootroot00000000000000/***************************************************************************** * predict.h: arm intra prediction ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_ARM_PREDICT_H #define X264_ARM_PREDICT_H #define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6) void x264_predict_4x4_dc_armv6( uint8_t *src ); #define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon) void x264_predict_4x4_dc_top_neon( uint8_t *src ); #define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6) void x264_predict_4x4_v_armv6( uint8_t *src ); #define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6) void x264_predict_4x4_h_armv6( uint8_t *src ); #define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6) void x264_predict_4x4_ddr_armv6( uint8_t *src ); #define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon) void x264_predict_4x4_ddl_neon( uint8_t *src ); #define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon) void x264_predict_8x8c_dc_neon( uint8_t *src ); #define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon) void x264_predict_8x8c_dc_top_neon( uint8_t *src ); #define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon) void x264_predict_8x8c_dc_left_neon( uint8_t *src ); #define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon) void x264_predict_8x8c_h_neon( uint8_t *src ); #define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon) void x264_predict_8x8c_v_neon( uint8_t *src ); #define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon) void x264_predict_8x8c_p_neon( uint8_t *src ); #define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon) void x264_predict_8x16c_h_neon( uint8_t *src ); #define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon) void x264_predict_8x16c_dc_top_neon( uint8_t *src ); #define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon) void x264_predict_8x16c_p_neon( uint8_t *src ); #define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon) void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon) void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon) void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon) void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon) void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon) void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon) void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon) void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon) void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); #define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon) void x264_predict_16x16_dc_neon( uint8_t *src ); #define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon) void x264_predict_16x16_dc_top_neon( uint8_t *src ); #define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon) void x264_predict_16x16_dc_left_neon( uint8_t *src ); #define 
x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon) void x264_predict_16x16_h_neon( uint8_t *src ); #define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon) void x264_predict_16x16_v_neon( uint8_t *src ); #define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon) void x264_predict_16x16_p_neon( uint8_t *src ); #define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm) void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] ); #define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm) void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); #define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm) void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm) void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm) void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] ); #endif x264-master/common/arm/quant-a.S000066400000000000000000000355031502133446700166260ustar00rootroot00000000000000/**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: David Conrad * Janne Grunau * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "asm.S" const pmovmskb_byte, align=4 .byte 1,2,4,8,16,32,64,128 .byte 1,2,4,8,16,32,64,128 endconst const mask_2bit, align=4 .byte 3,12,48,192,3,12,48,192 .byte 3,12,48,192,3,12,48,192 endconst const mask_1bit, align=4 .byte 128,64,32,16,8,4,2,1 .byte 128,64,32,16,8,4,2,1 endconst .text .macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no vadd.u16 q8, q8, \bias0 vadd.u16 q9, q9, \bias1 .ifc \load_mf, yes vld1.64 {\mf0-\mf3}, [r1,:128]! .endif vmull.u16 q10, d16, \mf0 vmull.u16 q11, d17, \mf1 vmull.u16 q12, d18, \mf2 vmull.u16 q13, d19, \mf3 vshr.s16 q14, q14, #15 vshr.s16 q15, q15, #15 vshrn.u32 d16, q10, #16 vshrn.u32 d17, q11, #16 vshrn.u32 d18, q12, #16 vshrn.u32 d19, q13, #16 veor q8, q8, q14 veor q9, q9, q15 vsub.s16 q8, q8, q14 vsub.s16 q9, q9, q15 vorr \mask, q8, q9 vst1.64 {d16-d19}, [r0,:128]! 
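    // QUANT_TWO above: (abs(coef) + bias) * mf >> 16 for 16 coefficients, sign
    // restored from the original dct in q14/q15 via (x ^ sign) - sign; the results
    // are stored back and OR-ed into \mask so callers can derive the nonzero flag.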
.endm .macro QUANT_END d vmov r2, r3, \d orrs r0, r2, r3 movne r0, #1 bx lr .endm // quant_2x2_dc( int16_t dct[4], int mf, int bias ) function quant_2x2_dc_neon vld1.64 {d0}, [r0,:64] vabs.s16 d3, d0 vdup.16 d2, r2 vdup.16 d1, r1 vadd.u16 d3, d3, d2 vmull.u16 q3, d3, d1 vshr.s16 d0, d0, #15 vshrn.u32 d3, q3, #16 veor d3, d3, d0 vsub.s16 d3, d3, d0 vst1.64 {d3}, [r0,:64] QUANT_END d3 endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) function quant_4x4_dc_neon vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vdup.16 q0, r2 vdup.16 q2, r1 QUANT_TWO q0, q0, d4, d5, d4, d5, q0 vorr d0, d0, d1 QUANT_END d0 endfunc // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) function quant_4x4_neon vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128] vld1.64 {d4-d7}, [r1,:128] QUANT_TWO q0, q1, d4, d5, d6, d7, q0 vorr d0, d0, d1 QUANT_END d0 endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) function quant_4x4x4_neon vpush {d8-d15} vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128] vld1.64 {d4-d7}, [r1,:128] QUANT_TWO q0, q1, d4, d5, d6, d7, q4 vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 QUANT_TWO q0, q1, d4, d5, d6, d7, q5 vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 QUANT_TWO q0, q1, d4, d5, d6, d7, q6 vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 QUANT_TWO q0, q1, d4, d5, d6, d7, q7 vorr d8, d8, d9 vorr d10, d10, d11 vorr d12, d12, d13 vorr d14, d14, d15 vmov r0, r1, d8 vmov r2, r3, d10 orrs r0, r1 movne r0, #1 orrs r2, r3 orrne r0, #2 vmov r1, r2, d12 vmov r3, ip, d14 orrs r1, r2 orrne r0, #4 orrs r3, ip orrne r0, #8 vpop {d8-d15} bx lr endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function quant_8x8_neon vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vld1.64 {d0-d3}, [r2,:128]! vld1.64 {d4-d7}, [r1,:128]! QUANT_TWO q0, q1, d4, d5, d6, d7, q0 .rept 3 vld1.64 {d28-d31}, [r0,:128] vabs.s16 q8, q14 vabs.s16 q9, q15 vld1.64 {d2-d5}, [r2,:128]! QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes vorr q0, q0, q1 .endr vorr d0, d0, d1 QUANT_END d0 endfunc .macro DEQUANT_START mf_size offset dc=no mov r3, #0x2b mul r3, r3, r2 lsr r3, r3, #8 // i_qbits = i_qp / 6 add ip, r3, r3, lsl #1 sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6 .ifc \dc,no add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf] .else ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0] .endif subs r3, r3, #\offset // 6 for 8x8 .endm // dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp ) .macro DEQUANT size bits function dequant_\size\()_neon DEQUANT_START \bits+2, \bits .ifc \size, 8x8 mov r2, #4 .endif blt dequant_\size\()_rshift vdup.16 q15, r3 dequant_\size\()_lshift_loop: .ifc \size, 8x8 subs r2, r2, #1 .endif vld1.32 {d16-d17}, [r1,:128]! vld1.32 {d18-d19}, [r1,:128]! vmovn.s32 d4, q8 vld1.32 {d20-d21}, [r1,:128]! vmovn.s32 d5, q9 vld1.32 {d22-d23}, [r1,:128]! vmovn.s32 d6, q10 vld1.16 {d0-d3}, [r0,:128] vmovn.s32 d7, q11 vmul.s16 q0, q0, q2 vmul.s16 q1, q1, q3 vshl.s16 q0, q0, q15 vshl.s16 q1, q1, q15 vst1.16 {d0-d3}, [r0,:128]! .ifc \size, 8x8 bgt dequant_\size\()_lshift_loop .endif bx lr dequant_\size\()_rshift: vdup.32 q15, r3 rsb r3, r3, #0 mov ip, #1 sub r3, r3, #1 lsl ip, ip, r3 .ifc \size, 8x8 dequant_\size\()_rshift_loop: subs r2, r2, #1 .endif vdup.32 q10, ip vld1.32 {d16-d17}, [r1,:128]! vdup.32 q11, ip vld1.32 {d18-d19}, [r1,:128]! vmovn.s32 d4, q8 vld1.32 {d16-d17}, [r1,:128]! 
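    // Right-shift dequant path: dct[i] = (dct[i] * mf[i] + f) >> shift, with the
    // rounding term f = 1 << (shift - 1) broadcast from ip and the narrowed 16-bit
    // mf values multiplied in via vmlal before the arithmetic shift by q15.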
vmovn.s32 d5, q9 vld1.32 {d18-d19}, [r1,:128]! vmovn.s32 d6, q8 vld1.16 {d0-d3}, [r0,:128] vmovn.s32 d7, q9 vdup.32 q12, ip vdup.32 q13, ip vmlal.s16 q10, d0, d4 vmlal.s16 q11, d1, d5 vmlal.s16 q12, d2, d6 vmlal.s16 q13, d3, d7 vshl.s32 q10, q10, q15 vshl.s32 q11, q11, q15 vshl.s32 q12, q12, q15 vshl.s32 q13, q13, q15 vmovn.s32 d0, q10 vmovn.s32 d1, q11 vmovn.s32 d2, q12 vmovn.s32 d3, q13 vst1.16 {d0-d3}, [r0,:128]! .ifc \size, 8x8 bgt dequant_\size\()_rshift_loop .endif bx lr endfunc .endm DEQUANT 4x4, 4 DEQUANT 8x8, 6 // dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp ) function dequant_4x4_dc_neon DEQUANT_START 6, 6, yes blt dequant_4x4_dc_rshift lsl r1, r1, r3 vdup.16 q2, r1 vld1.16 {d0-d3}, [r0,:128] vdup.16 q15, r3 vmul.s16 q0, q0, q2 vmul.s16 q1, q1, q2 vst1.16 {d0-d3}, [r0,:128] bx lr dequant_4x4_dc_rshift: vdup.16 d4, r1 vdup.32 q15, r3 rsb r3, r3, #0 mov ip, #1 sub r3, r3, #1 lsl ip, ip, r3 vdup.32 q10, ip vdup.32 q11, ip vld1.16 {d0-d3}, [r0,:128] vdup.32 q12, ip vdup.32 q13, ip vmlal.s16 q10, d0, d4 vmlal.s16 q11, d1, d4 vmlal.s16 q12, d2, d4 vmlal.s16 q13, d3, d4 vshl.s32 q10, q10, q15 vshl.s32 q11, q11, q15 vshl.s32 q12, q12, q15 vshl.s32 q13, q13, q15 vmovn.s32 d0, q10 vmovn.s32 d1, q11 vmovn.s32 d2, q12 vmovn.s32 d3, q13 vst1.16 {d0-d3}, [r0,:128] bx lr endfunc .macro decimate_score_1x size function decimate_score\size\()_neon vld1.16 {q0, q1}, [r0, :128] movrel r3, mask_2bit vmov.s8 q3, #0x01 vqmovn.s16 d0, q0 vqmovn.s16 d1, q1 vqabs.s8 q2, q0 vld1.8 {q8}, [r3, :128] vceq.s8 q1, q0, #0 vcgt.s8 q2, q2, q3 vand.u8 q1, q1, q8 vshrn.u16 d4, q2, #4 vpadd.u8 d2, d2, d3 vpadd.u8 d4, d4, d4 vpadd.u8 d2, d2, d2 vmov.32 r2, d4[0] vmov.32 r1, d2[0] cmp r2, #0 beq 0f mov r0, #9 bx lr 0: mvns r1, r1 mov r0, #0 bxeq lr .ifc \size, 15 lsr r1, r1, #2 .endif rbit r1, r1 movrelx r3, X264(decimate_table4), r2 1: clz r2, r1 lsl r1, r1, r2 lsr r12, r2, #1 ldrb r2, [r3, r12] lsls r1, r1, #2 add r0, r0, r2 bne 1b bx lr endfunc .endm decimate_score_1x 15 decimate_score_1x 16 function decimate_score64_neon push {lr} vld1.16 {q8, q9}, [r0, :128]! vld1.16 {q10, q11}, [r0, :128]! vld1.16 {q12, q13}, [r0, :128]! 
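    // decimate_score64: return 9 as soon as any coefficient has |level| > 1;
    // otherwise pack zero/nonzero flags into a bitmask and sum the decimate_table8
    // costs of the zero runs preceding each nonzero level.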
vld1.16 {q14, q15}, [r0, :128] movrel r3, mask_1bit vmov.s8 q3, #0x01 vqmovn.s16 d17, q8 vqmovn.s16 d16, q9 vqmovn.s16 d19, q10 vqmovn.s16 d18, q11 vqmovn.s16 d21, q12 vqmovn.s16 d20, q13 vqmovn.s16 d23, q14 vqmovn.s16 d22, q15 vqabs.s8 q12, q8 vqabs.s8 q13, q9 vqabs.s8 q14, q10 vqabs.s8 q15, q11 vld1.8 {q2}, [r3, :128] vceq.s8 q8, q8, #0 vceq.s8 q9, q9, #0 vceq.s8 q10, q10, #0 vceq.s8 q11, q11, #0 vmax.s8 q12, q12, q13 vmax.s8 q14, q14, q15 vand.u8 q8, q8, q2 vand.u8 q9, q9, q2 vand.u8 q10, q10, q2 vand.u8 q11, q11, q2 vmax.s8 q12, q12, q14 vpadd.u8 d18, d18, d19 vpadd.u8 d19, d16, d17 vcgt.s8 q12, q12, q3 vpadd.u8 d22, d22, d23 vpadd.u8 d23, d20, d21 vshrn.u16 d24, q12, #4 vpadd.u8 d16, d22, d23 vpadd.u8 d17, d18, d19 vpadd.u8 d24, d24, d24 vpadd.u8 d16, d16, d17 vmov.32 r2, d24[0] vmov r12, r1, d16 cmp r2, #0 beq 0f mov r0, #9 pop {pc} 0: mvns r1, r1 mvn r12, r12 mov r0, #0 mov lr, #32 movrelx r3, X264(decimate_table8), r2 beq 2f 1: clz r2, r1 lsl r1, r1, r2 sub lr, lr, r2 ldrb r2, [r3, r2] lsls r1, r1, #1 sub lr, lr, #1 add r0, r0, r2 bne 1b 2: cmp r12, #0 popeq {pc} clz r2, r12 lsl r1, r12, r2 add r2, r2, lr ldrb r2, [r3, r2] lsls r1, r1, #1 add r0, r0, r2 popeq {pc} 3: clz r2, r1 lsl r1, r1, r2 ldrb r2, [r3, r2] lsls r1, r1, #1 add r0, r0, r2 bne 3b pop {pc} endfunc // int coeff_last( int16_t *l ) function coeff_last4_arm ldrd r2, r3, [r0] subs r0, r3, #0 movne r0, #2 movne r2, r3 lsrs r2, r2, #16 addne r0, r0, #1 bx lr endfunc function coeff_last8_arm ldrd r2, r3, [r0, #8] orrs ip, r2, r3 movne r0, #4 ldrdeq r2, r3, [r0] moveq r0, #0 tst r3, r3 addne r0, #2 movne r2, r3 lsrs r2, r2, #16 addne r0, r0, #1 bx lr endfunc .macro COEFF_LAST_1x size function coeff_last\size\()_neon .if \size == 15 sub r0, r0, #2 .endif vld1.64 {d0-d3}, [r0,:128] vtst.16 q0, q0 vtst.16 q1, q1 vshrn.u16 d0, q0, #8 vshrn.u16 d1, q1, #8 vshrn.u16 d0, q0, #4 vclz.i32 d0, d0 mov ip, #7 mov r3, #\size - 9 vmov r0, r1, d0 subs r1, ip, r1, lsr #2 addge r0, r1, #\size - 8 subslt r0, r3, r0, lsr #2 movlt r0, #0 bx lr endfunc .endm COEFF_LAST_1x 15 COEFF_LAST_1x 16 function coeff_last64_neon vld1.64 {d16-d19}, [r0,:128]! vqmovn.u16 d16, q8 vqmovn.u16 d17, q9 vld1.64 {d20-d23}, [r0,:128]! vqmovn.u16 d18, q10 vqmovn.u16 d19, q11 vld1.64 {d24-d27}, [r0,:128]! vqmovn.u16 d20, q12 vqmovn.u16 d21, q13 vld1.64 {d28-d31}, [r0,:128]! vqmovn.u16 d22, q14 vqmovn.u16 d23, q15 movrel r1, pmovmskb_byte vld1.64 {d0-d1}, [r1,:128] vtst.8 q8, q8 vtst.8 q9, q9 vtst.8 q10, q10 vtst.8 q11, q11 vand q8, q8, q0 vand q9, q9, q0 vand q10, q10, q0 vand q11, q11, q0 vpadd.u8 d0, d16, d17 vpadd.u8 d1, d18, d19 vpadd.u8 d2, d20, d21 vpadd.u8 d3, d22, d23 vpadd.u8 d0, d0, d1 vpadd.u8 d1, d2, d3 vpadd.u8 d0, d0, d1 vclz.i32 d0, d0 mov ip, #31 vmov r0, r1, d0 subs r1, ip, r1 addge r0, r1, #32 subslt r0, ip, r0 movlt r0, #0 bx lr endfunc function denoise_dct_neon 1: subs r3, r3, #16 vld1.16 {q0, q1}, [r0] vld1.32 {q12, q13}, [r1]! vld1.32 {q14, q15}, [r1] sub r1, #32 vabs.s16 q8, q0 vabs.s16 q9, q1 vld1.16 {q2, q3}, [r2]! vclt.s16 q10, q0, #0 vclt.s16 q11, q1, #0 vaddw.u16 q12, q12, d16 vaddw.u16 q13, q13, d17 vqsub.u16 q0, q8, q2 vqsub.u16 q1, q9, q3 vaddw.u16 q14, q14, d18 vaddw.u16 q15, q15, d19 vneg.s16 q8, q0 vneg.s16 q9, q1 vbsl q10, q8, q0 vbsl q11, q9, q1 vst1.32 {q12, q13}, [r1]! vst1.32 {q14, q15}, [r1]! vst1.16 {q10, q11}, [r0]! 
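    // denoise_dct loop body: for each block of 16 coefficients, sum[i] += abs(dct[i])
    // and dct[i] = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0), using saturating
    // subtraction and a vbsl sign select; r3 counts the remaining coefficients.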
bgt 1b bx lr endfunc x264-master/common/arm/quant.h000066400000000000000000000067121502133446700164350ustar00rootroot00000000000000/***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: David Conrad * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ARM_QUANT_H #define X264_ARM_QUANT_H #define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6) int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias ); #define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon) int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias ); #define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon) int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias ); #define x264_quant_4x4_neon x264_template(quant_4x4_neon) int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ); #define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon) int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ); #define x264_quant_8x8_neon x264_template(quant_8x8_neon) int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ); #define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon) void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4_neon x264_template(dequant_4x4_neon) void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_neon x264_template(dequant_8x8_neon) void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #define x264_decimate_score15_neon x264_template(decimate_score15_neon) int x264_decimate_score15_neon( int16_t * ); #define x264_decimate_score16_neon x264_template(decimate_score16_neon) int x264_decimate_score16_neon( int16_t * ); #define x264_decimate_score64_neon x264_template(decimate_score64_neon) int x264_decimate_score64_neon( int16_t * ); #define x264_coeff_last4_arm x264_template(coeff_last4_arm) int x264_coeff_last4_arm( int16_t * ); #define x264_coeff_last8_arm x264_template(coeff_last8_arm) int x264_coeff_last8_arm( int16_t * ); #define x264_coeff_last15_neon x264_template(coeff_last15_neon) int x264_coeff_last15_neon( int16_t * ); #define x264_coeff_last16_neon x264_template(coeff_last16_neon) int x264_coeff_last16_neon( int16_t * ); #define x264_coeff_last64_neon x264_template(coeff_last64_neon) int x264_coeff_last64_neon( int16_t * ); #define x264_denoise_dct_neon x264_template(denoise_dct_neon) void 
x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int ); #endif x264-master/common/base.c000066400000000000000000001540641502133446700154370ustar00rootroot00000000000000/***************************************************************************** * base.c: misc common functions (bit depth independent) ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "base.h" #include <ctype.h> #if HAVE_MALLOC_H #include <malloc.h> #endif #if HAVE_THP #include <sys/mman.h> #endif #define X264_ISDIGIT(x) isdigit((unsigned char)(x)) /**************************************************************************** * x264_reduce_fraction: ****************************************************************************/ #define REDUCE_FRACTION( name, type )\ void name( type *n, type *d )\ { \ type a = *n; \ type b = *d; \ type c; \ if( !a || !b ) \ return; \ c = a % b; \ while( c ) \ { \ a = b; \ b = c; \ c = a % b; \ } \ *n /= b; \ *d /= b; \ } REDUCE_FRACTION( x264_reduce_fraction , uint32_t ) REDUCE_FRACTION( x264_reduce_fraction64, uint64_t ) /**************************************************************************** * x264_log: ****************************************************************************/ void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg ) { char *psz_prefix; switch( i_level ) { case X264_LOG_ERROR: psz_prefix = "error"; break; case X264_LOG_WARNING: psz_prefix = "warning"; break; case X264_LOG_INFO: psz_prefix = "info"; break; case X264_LOG_DEBUG: psz_prefix = "debug"; break; default: psz_prefix = "unknown"; break; } fprintf( stderr, "x264 [%s]: ", psz_prefix ); x264_vfprintf( stderr, psz_fmt, arg ); } void x264_log_internal( int i_level, const char *psz_fmt, ... ) { va_list arg; va_start( arg, psz_fmt ); x264_log_default( NULL, i_level, psz_fmt, arg ); va_end( arg ); } /**************************************************************************** * x264_malloc: ****************************************************************************/ void *x264_malloc( int64_t i_size ) { #define HUGE_PAGE_SIZE 2*1024*1024 #define HUGE_PAGE_THRESHOLD HUGE_PAGE_SIZE*7/8 /* FIXME: Is this optimal? */ if( i_size < 0 || (uint64_t)i_size > (SIZE_MAX - HUGE_PAGE_SIZE) /*|| (uint64_t)i_size > (SIZE_MAX - NATIVE_ALIGN - sizeof(void **))*/ ) { x264_log_internal( X264_LOG_ERROR, "invalid size of malloc: %"PRId64"\n", i_size ); return NULL; } uint8_t *align_buf = NULL; #if HAVE_MALLOC_H #if HAVE_THP /* Attempt to allocate huge pages to reduce TLB misses.
*/ if( i_size >= HUGE_PAGE_THRESHOLD ) { align_buf = memalign( HUGE_PAGE_SIZE, i_size ); if( align_buf ) { /* Round up to the next huge page boundary if we are close enough. */ size_t madv_size = (i_size + HUGE_PAGE_SIZE - HUGE_PAGE_THRESHOLD) & ~(HUGE_PAGE_SIZE-1); madvise( align_buf, madv_size, MADV_HUGEPAGE ); } } else #endif align_buf = memalign( NATIVE_ALIGN, i_size ); #else uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) ); if( buf ) { align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **); align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1); *( (void **) ( align_buf - sizeof(void **) ) ) = buf; } #endif if( !align_buf ) x264_log_internal( X264_LOG_ERROR, "malloc of size %"PRId64" failed\n", i_size ); return align_buf; #undef HUGE_PAGE_SIZE #undef HUGE_PAGE_THRESHOLD } /**************************************************************************** * x264_free: ****************************************************************************/ void x264_free( void *p ) { if( p ) { #if HAVE_MALLOC_H free( p ); #else free( *( ( ( void **) p ) - 1 ) ); #endif } } /**************************************************************************** * x264_slurp_file: ****************************************************************************/ char *x264_slurp_file( const char *filename ) { int b_error = 0; int64_t i_size; char *buf; FILE *fh = x264_fopen( filename, "rb" ); if( !fh ) return NULL; b_error |= fseek( fh, 0, SEEK_END ) < 0; b_error |= ( i_size = ftell( fh ) ) <= 0; if( WORD_SIZE == 4 ) b_error |= i_size > INT32_MAX; b_error |= fseek( fh, 0, SEEK_SET ) < 0; if( b_error ) goto error; buf = x264_malloc( i_size+2 ); if( !buf ) goto error; b_error |= fread( buf, 1, i_size, fh ) != (uint64_t)i_size; fclose( fh ); if( b_error ) { x264_free( buf ); return NULL; } if( buf[i_size-1] != '\n' ) buf[i_size++] = '\n'; buf[i_size] = '\0'; return buf; error: fclose( fh ); return NULL; } /**************************************************************************** * x264_param_strdup: ****************************************************************************/ typedef struct { int size; int count; void *ptr[]; } strdup_buffer; #define BUFFER_OFFSET (int)offsetof(strdup_buffer, ptr) #define BUFFER_DEFAULT_SIZE 16 char *x264_param_strdup( x264_param_t *param, const char *src ) { strdup_buffer *buf = param->opaque; if( !buf ) { buf = malloc( BUFFER_OFFSET + BUFFER_DEFAULT_SIZE * sizeof(void *) ); if( !buf ) goto fail; buf->size = BUFFER_DEFAULT_SIZE; buf->count = 0; param->opaque = buf; } else if( buf->count == buf->size ) { if( buf->size > (INT_MAX - BUFFER_OFFSET) / 2 / (int)sizeof(void *) ) goto fail; int new_size = buf->size * 2; buf = realloc( buf, BUFFER_OFFSET + new_size * sizeof(void *) ); if( !buf ) goto fail; buf->size = new_size; param->opaque = buf; } char *res = strdup( src ); if( !res ) goto fail; buf->ptr[buf->count++] = res; return res; fail: x264_log_internal( X264_LOG_ERROR, "x264_param_strdup failed\n" ); return NULL; } /**************************************************************************** * x264_param_cleanup: ****************************************************************************/ REALIGN_STACK void x264_param_cleanup( x264_param_t *param ) { strdup_buffer *buf = param->opaque; if( buf ) { for( int i = 0; i < buf->count; i++ ) free( buf->ptr[i] ); free( buf ); param->opaque = NULL; } } /**************************************************************************** * x264_picture_init: ****************************************************************************/ 
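/* Reset a picture to its default state: everything zeroed, with frame type, QP
 * and pic_struct set to their AUTO values so the encoder picks them itself. */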
REALIGN_STACK void x264_picture_init( x264_picture_t *pic ) { memset( pic, 0, sizeof( x264_picture_t ) ); pic->i_type = X264_TYPE_AUTO; pic->i_qpplus1 = X264_QP_AUTO; pic->i_pic_struct = PIC_STRUCT_AUTO; } /**************************************************************************** * x264_picture_alloc: ****************************************************************************/ REALIGN_STACK int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) { typedef struct { int planes; int width_fix8[3]; int height_fix8[3]; } x264_csp_tab_t; static const x264_csp_tab_t csp_tab[] = { [X264_CSP_I400] = { 1, { 256*1 }, { 256*1 } }, [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, [X264_CSP_NV21] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, [X264_CSP_YUYV] = { 1, { 256*2 }, { 256*1 }, }, [X264_CSP_UYVY] = { 1, { 256*2 }, { 256*1 }, }, [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, }, [X264_CSP_BGRA] = { 1, { 256*4 }, { 256*1 }, }, [X264_CSP_RGB] = { 1, { 256*3 }, { 256*1 }, }, }; int csp = i_csp & X264_CSP_MASK; if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 ) return -1; x264_picture_init( pic ); pic->img.i_csp = i_csp; pic->img.i_plane = csp_tab[csp].planes; int depth_factor = i_csp & X264_CSP_HIGH_DEPTH ? 2 : 1; int64_t plane_offset[3] = {0}; int64_t frame_size = 0; for( int i = 0; i < pic->img.i_plane; i++ ) { int stride = (((int64_t)i_width * csp_tab[csp].width_fix8[i]) >> 8) * depth_factor; int64_t plane_size = (((int64_t)i_height * csp_tab[csp].height_fix8[i]) >> 8) * stride; pic->img.i_stride[i] = stride; plane_offset[i] = frame_size; frame_size += plane_size; } pic->img.plane[0] = x264_malloc( frame_size ); if( !pic->img.plane[0] ) return -1; for( int i = 1; i < pic->img.i_plane; i++ ) pic->img.plane[i] = pic->img.plane[0] + plane_offset[i]; return 0; } /**************************************************************************** * x264_picture_clean: ****************************************************************************/ REALIGN_STACK void x264_picture_clean( x264_picture_t *pic ) { x264_free( pic->img.plane[0] ); /* just to be safe */ memset( pic, 0, sizeof( x264_picture_t ) ); } /**************************************************************************** * x264_param_default: ****************************************************************************/ REALIGN_STACK void x264_param_default( x264_param_t *param ) { /* */ memset( param, 0, sizeof( x264_param_t ) ); /* CPU autodetect */ param->cpu = x264_cpu_detect(); param->i_threads = X264_THREADS_AUTO; param->i_lookahead_threads = X264_THREADS_AUTO; param->b_deterministic = 1; param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO; /* Video properties */ param->i_csp = X264_CHROMA_FORMAT ? 
X264_CHROMA_FORMAT : X264_CSP_I420; param->i_width = 0; param->i_height = 0; param->vui.i_sar_width = 0; param->vui.i_sar_height= 0; param->vui.i_overscan = 0; /* undef */ param->vui.i_vidformat = 5; /* undef */ param->vui.b_fullrange = -1; /* default depends on input */ param->vui.i_colorprim = 2; /* undef */ param->vui.i_transfer = 2; /* undef */ param->vui.i_colmatrix = -1; /* default depends on input */ param->vui.i_chroma_loc= 0; /* left center */ param->i_fps_num = 25; param->i_fps_den = 1; param->i_level_idc = -1; param->i_slice_max_size = 0; param->i_slice_max_mbs = 0; param->i_slice_count = 0; #if HAVE_BITDEPTH8 param->i_bitdepth = 8; #elif HAVE_BITDEPTH10 param->i_bitdepth = 10; #else param->i_bitdepth = 8; #endif /* Encoder parameters */ param->i_frame_reference = 3; param->i_keyint_max = 250; param->i_keyint_min = X264_KEYINT_MIN_AUTO; param->i_bframe = 3; param->i_scenecut_threshold = 40; param->i_bframe_adaptive = X264_B_ADAPT_FAST; param->i_bframe_bias = 0; param->i_bframe_pyramid = X264_B_PYRAMID_NORMAL; param->b_interlaced = 0; param->b_constrained_intra = 0; param->b_deblocking_filter = 1; param->i_deblocking_filter_alphac0 = 0; param->i_deblocking_filter_beta = 0; param->b_cabac = 1; param->i_cabac_init_idc = 0; param->rc.i_rc_method = X264_RC_CRF; param->rc.i_bitrate = 0; param->rc.f_rate_tolerance = 1.0; param->rc.i_vbv_max_bitrate = 0; param->rc.i_vbv_buffer_size = 0; param->rc.f_vbv_buffer_init = 0.9; param->rc.i_qp_constant = -1; param->rc.f_rf_constant = 23; param->rc.i_qp_min = 0; param->rc.i_qp_max = INT_MAX; param->rc.i_qp_step = 4; param->rc.f_ip_factor = 1.4; param->rc.f_pb_factor = 1.3; param->rc.i_aq_mode = X264_AQ_VARIANCE; param->rc.f_aq_strength = 1.0; param->rc.i_lookahead = 40; param->rc.b_stat_write = 0; param->rc.psz_stat_out = "x264_2pass.log"; param->rc.b_stat_read = 0; param->rc.psz_stat_in = "x264_2pass.log"; param->rc.f_qcompress = 0.6; param->rc.f_qblur = 0.5; param->rc.f_complexity_blur = 20; param->rc.i_zones = 0; param->rc.b_mb_tree = 1; /* Log */ param->pf_log = x264_log_default; param->p_log_private = NULL; param->i_log_level = X264_LOG_INFO; /* */ param->analyse.intra = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8; param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL; param->analyse.i_me_method = X264_ME_HEX; param->analyse.f_psy_rd = 1.0; param->analyse.b_psy = 1; param->analyse.f_psy_trellis = 0; param->analyse.i_me_range = 16; param->analyse.i_subpel_refine = 7; param->analyse.b_mixed_references = 1; param->analyse.b_chroma_me = 1; param->analyse.i_mv_range_thread = -1; param->analyse.i_mv_range = -1; // set from level_idc param->analyse.i_chroma_qp_offset = 0; param->analyse.b_fast_pskip = 1; param->analyse.b_weighted_bipred = 1; param->analyse.i_weighted_pred = X264_WEIGHTP_SMART; param->analyse.b_dct_decimate = 1; param->analyse.b_transform_8x8 = 1; param->analyse.i_trellis = 1; param->analyse.i_luma_deadzone[0] = 21; param->analyse.i_luma_deadzone[1] = 11; param->analyse.b_psnr = 0; param->analyse.b_ssim = 0; param->i_cqm_preset = X264_CQM_FLAT; memset( param->cqm_4iy, 16, sizeof( param->cqm_4iy ) ); memset( param->cqm_4py, 16, sizeof( param->cqm_4py ) ); memset( param->cqm_4ic, 16, sizeof( param->cqm_4ic ) ); memset( param->cqm_4pc, 16, sizeof( param->cqm_4pc ) ); memset( param->cqm_8iy, 16, sizeof( param->cqm_8iy ) ); memset( param->cqm_8py, 16, sizeof( param->cqm_8py ) ); memset( param->cqm_8ic, 16, sizeof( 
param->cqm_8ic ) ); memset( param->cqm_8pc, 16, sizeof( param->cqm_8pc ) ); param->b_repeat_headers = 1; param->b_annexb = 1; param->b_aud = 0; param->b_vfr_input = 1; param->i_nal_hrd = X264_NAL_HRD_NONE; param->b_tff = 1; param->b_pic_struct = 0; param->b_fake_interlaced = 0; param->i_frame_packing = -1; param->i_alternative_transfer = 2; /* undef */ param->b_opencl = 0; param->i_opencl_device = 0; param->opencl_device_id = NULL; param->psz_clbin_file = NULL; param->i_avcintra_class = 0; param->i_avcintra_flavor = X264_AVCINTRA_FLAVOR_PANASONIC; } static int param_apply_preset( x264_param_t *param, const char *preset ) { char *end; int i = strtol( preset, &end, 10 ); if( *end == 0 && i >= 0 && i < ARRAY_ELEMS(x264_preset_names)-1 ) preset = x264_preset_names[i]; if( !strcasecmp( preset, "ultrafast" ) ) { param->i_frame_reference = 1; param->i_scenecut_threshold = 0; param->b_deblocking_filter = 0; param->b_cabac = 0; param->i_bframe = 0; param->analyse.intra = 0; param->analyse.inter = 0; param->analyse.b_transform_8x8 = 0; param->analyse.i_me_method = X264_ME_DIA; param->analyse.i_subpel_refine = 0; param->rc.i_aq_mode = 0; param->analyse.b_mixed_references = 0; param->analyse.i_trellis = 0; param->i_bframe_adaptive = X264_B_ADAPT_NONE; param->rc.b_mb_tree = 0; param->analyse.i_weighted_pred = X264_WEIGHTP_NONE; param->analyse.b_weighted_bipred = 0; param->rc.i_lookahead = 0; } else if( !strcasecmp( preset, "superfast" ) ) { param->analyse.inter = X264_ANALYSE_I8x8|X264_ANALYSE_I4x4; param->analyse.i_me_method = X264_ME_DIA; param->analyse.i_subpel_refine = 1; param->i_frame_reference = 1; param->analyse.b_mixed_references = 0; param->analyse.i_trellis = 0; param->rc.b_mb_tree = 0; param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE; param->rc.i_lookahead = 0; } else if( !strcasecmp( preset, "veryfast" ) ) { param->analyse.i_subpel_refine = 2; param->i_frame_reference = 1; param->analyse.b_mixed_references = 0; param->analyse.i_trellis = 0; param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE; param->rc.i_lookahead = 10; } else if( !strcasecmp( preset, "faster" ) ) { param->analyse.b_mixed_references = 0; param->i_frame_reference = 2; param->analyse.i_subpel_refine = 4; param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE; param->rc.i_lookahead = 20; } else if( !strcasecmp( preset, "fast" ) ) { param->i_frame_reference = 2; param->analyse.i_subpel_refine = 6; param->analyse.i_weighted_pred = X264_WEIGHTP_SIMPLE; param->rc.i_lookahead = 30; } else if( !strcasecmp( preset, "medium" ) ) { /* Default is medium */ } else if( !strcasecmp( preset, "slow" ) ) { param->analyse.i_subpel_refine = 8; param->i_frame_reference = 5; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO; param->analyse.i_trellis = 2; param->rc.i_lookahead = 50; } else if( !strcasecmp( preset, "slower" ) ) { param->analyse.i_me_method = X264_ME_UMH; param->analyse.i_subpel_refine = 9; param->i_frame_reference = 8; param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO; param->analyse.inter |= X264_ANALYSE_PSUB8x8; param->analyse.i_trellis = 2; param->rc.i_lookahead = 60; } else if( !strcasecmp( preset, "veryslow" ) ) { param->analyse.i_me_method = X264_ME_UMH; param->analyse.i_subpel_refine = 10; param->analyse.i_me_range = 24; param->i_frame_reference = 16; param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO; param->analyse.inter |= X264_ANALYSE_PSUB8x8; param->analyse.i_trellis = 2; param->i_bframe = 8; 
param->rc.i_lookahead = 60; } else if( !strcasecmp( preset, "placebo" ) ) { param->analyse.i_me_method = X264_ME_TESA; param->analyse.i_subpel_refine = 11; param->analyse.i_me_range = 24; param->i_frame_reference = 16; param->i_bframe_adaptive = X264_B_ADAPT_TRELLIS; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_AUTO; param->analyse.inter |= X264_ANALYSE_PSUB8x8; param->analyse.b_fast_pskip = 0; param->analyse.i_trellis = 2; param->i_bframe = 16; param->rc.i_lookahead = 60; } else { x264_log_internal( X264_LOG_ERROR, "invalid preset '%s'\n", preset ); return -1; } return 0; } static int param_apply_tune( x264_param_t *param, const char *tune ) { int psy_tuning_used = 0; for( int len; tune += strspn( tune, ",./-+" ), (len = strcspn( tune, ",./-+" )); tune += len ) { if( len == 4 && !strncasecmp( tune, "film", 4 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->i_deblocking_filter_alphac0 = -1; param->i_deblocking_filter_beta = -1; param->analyse.f_psy_trellis = 0.15; } else if( len == 9 && !strncasecmp( tune, "animation", 9 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->i_frame_reference = param->i_frame_reference > 1 ? param->i_frame_reference*2 : 1; param->i_deblocking_filter_alphac0 = 1; param->i_deblocking_filter_beta = 1; param->analyse.f_psy_rd = 0.4; param->rc.f_aq_strength = 0.6; param->i_bframe += 2; } else if( len == 5 && !strncasecmp( tune, "grain", 5 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->i_deblocking_filter_alphac0 = -2; param->i_deblocking_filter_beta = -2; param->analyse.f_psy_trellis = 0.25; param->analyse.b_dct_decimate = 0; param->rc.f_pb_factor = 1.1; param->rc.f_ip_factor = 1.1; param->rc.f_aq_strength = 0.5; param->analyse.i_luma_deadzone[0] = 6; param->analyse.i_luma_deadzone[1] = 6; param->rc.f_qcompress = 0.8; } else if( len == 10 && !strncasecmp( tune, "stillimage", 10 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->i_deblocking_filter_alphac0 = -3; param->i_deblocking_filter_beta = -3; param->analyse.f_psy_rd = 2.0; param->analyse.f_psy_trellis = 0.7; param->rc.f_aq_strength = 1.2; } else if( len == 4 && !strncasecmp( tune, "psnr", 4 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->rc.i_aq_mode = X264_AQ_NONE; param->analyse.b_psy = 0; } else if( len == 4 && !strncasecmp( tune, "ssim", 4 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->rc.i_aq_mode = X264_AQ_AUTOVARIANCE; param->analyse.b_psy = 0; } else if( len == 10 && !strncasecmp( tune, "fastdecode", 10 ) ) { param->b_deblocking_filter = 0; param->b_cabac = 0; param->analyse.b_weighted_bipred = 0; param->analyse.i_weighted_pred = X264_WEIGHTP_NONE; } else if( len == 11 && !strncasecmp( tune, "zerolatency", 11 ) ) { param->rc.i_lookahead = 0; param->i_sync_lookahead = 0; param->i_bframe = 0; param->b_sliced_threads = 1; param->b_vfr_input = 0; param->rc.b_mb_tree = 0; } else if( len == 6 && !strncasecmp( tune, "touhou", 6 ) ) { if( psy_tuning_used++ ) goto psy_failure; param->i_frame_reference = param->i_frame_reference > 1 ? 
param->i_frame_reference*2 : 1; param->i_deblocking_filter_alphac0 = -1; param->i_deblocking_filter_beta = -1; param->analyse.f_psy_trellis = 0.2; param->rc.f_aq_strength = 1.3; if( param->analyse.inter & X264_ANALYSE_PSUB16x16 ) param->analyse.inter |= X264_ANALYSE_PSUB8x8; } else { x264_log_internal( X264_LOG_ERROR, "invalid tune '%.*s'\n", len, tune ); return -1; psy_failure: x264_log_internal( X264_LOG_WARNING, "only 1 psy tuning can be used: ignoring tune %.*s\n", len, tune ); } } return 0; } REALIGN_STACK int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune ) { x264_param_default( param ); if( preset && param_apply_preset( param, preset ) < 0 ) return -1; if( tune && param_apply_tune( param, tune ) < 0 ) return -1; return 0; } REALIGN_STACK void x264_param_apply_fastfirstpass( x264_param_t *param ) { /* Set faster options in case of turbo firstpass. */ if( param->rc.b_stat_write && !param->rc.b_stat_read ) { param->i_frame_reference = 1; param->analyse.b_transform_8x8 = 0; param->analyse.inter = 0; param->analyse.i_me_method = X264_ME_DIA; param->analyse.i_subpel_refine = X264_MIN( 2, param->analyse.i_subpel_refine ); param->analyse.i_trellis = 0; param->analyse.b_fast_pskip = 1; } } static int profile_string_to_int( const char *str ) { if( !strcasecmp( str, "baseline" ) ) return PROFILE_BASELINE; if( !strcasecmp( str, "main" ) ) return PROFILE_MAIN; if( !strcasecmp( str, "high" ) ) return PROFILE_HIGH; if( !strcasecmp( str, "high10" ) ) return PROFILE_HIGH10; if( !strcasecmp( str, "high422" ) ) return PROFILE_HIGH422; if( !strcasecmp( str, "high444" ) ) return PROFILE_HIGH444_PREDICTIVE; return -1; } REALIGN_STACK int x264_param_apply_profile( x264_param_t *param, const char *profile ) { if( !profile ) return 0; const int qp_bd_offset = 6 * (param->i_bitdepth-8); int p = profile_string_to_int( profile ); if( p < 0 ) { x264_log_internal( X264_LOG_ERROR, "invalid profile: %s\n", profile ); return -1; } if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) || (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + qp_bd_offset) <= 0)) ) { x264_log_internal( X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile ); return -1; } if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 ) { x264_log_internal( X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile ); return -1; } if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 ) { x264_log_internal( X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile ); return -1; } if( p < PROFILE_HIGH10 && param->i_bitdepth > 8 ) { x264_log_internal( X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, param->i_bitdepth ); return -1; } if( p < PROFILE_HIGH && (param->i_csp & X264_CSP_MASK) == X264_CSP_I400 ) { x264_log_internal( X264_LOG_ERROR, "%s profile doesn't support 4:0:0\n", profile ); return -1; } if( p == PROFILE_BASELINE ) { param->analyse.b_transform_8x8 = 0; param->b_cabac = 0; param->i_cqm_preset = X264_CQM_FLAT; param->psz_cqm_file = NULL; param->i_bframe = 0; param->analyse.i_weighted_pred = X264_WEIGHTP_NONE; if( param->b_interlaced ) { x264_log_internal( X264_LOG_ERROR, "baseline profile doesn't support interlacing\n" ); return -1; } if( param->b_fake_interlaced ) { x264_log_internal( X264_LOG_ERROR, "baseline profile doesn't support fake interlacing\n" ); return -1; } } else if( p == PROFILE_MAIN ) { 
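/* Main profile: the High-profile-only tools (8x8 transform, custom
 * quantization matrices) are switched off; CABAC, B-frames and interlacing
 * remain available, so nothing else needs to change here. */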
param->analyse.b_transform_8x8 = 0; param->i_cqm_preset = X264_CQM_FLAT; param->psz_cqm_file = NULL; } return 0; } static int parse_enum( const char *arg, const char * const *names, int *dst ) { for( int i = 0; names[i]; i++ ) if( *names[i] && !strcasecmp( arg, names[i] ) ) { *dst = i; return 0; } return -1; } static int parse_cqm( const char *str, uint8_t *cqm, int length ) { int i = 0; do { int coef; if( !sscanf( str, "%d", &coef ) || coef < 1 || coef > 255 ) return -1; cqm[i++] = coef; } while( i < length && (str = strchr( str, ',' )) && str++ ); return (i == length) ? 0 : -1; } static int atobool_internal( const char *str, int *b_error ) { if( !strcmp(str, "1") || !strcasecmp(str, "true") || !strcasecmp(str, "yes") ) return 1; if( !strcmp(str, "0") || !strcasecmp(str, "false") || !strcasecmp(str, "no") ) return 0; *b_error = 1; return 0; } static int atoi_internal( const char *str, int *b_error ) { char *end; int v = strtol( str, &end, 0 ); if( end == str || *end != '\0' ) *b_error = 1; return v; } static double atof_internal( const char *str, int *b_error ) { char *end; double v = strtod( str, &end ); if( end == str || *end != '\0' ) *b_error = 1; return v; } #define atobool(str) ( name_was_bool = 1, atobool_internal( str, &b_error ) ) #undef atoi #undef atof #define atoi(str) atoi_internal( str, &b_error ) #define atof(str) atof_internal( str, &b_error ) #define CHECKED_ERROR_PARAM_STRDUP( var, param, src )\ do {\ var = x264_param_strdup( param, src );\ if( !var )\ {\ b_error = 1;\ errortype = X264_PARAM_ALLOC_FAILED;\ }\ } while( 0 ) REALIGN_STACK int x264_param_parse( x264_param_t *p, const char *name, const char *value ) { char *name_buf = NULL; int b_error = 0; int errortype = X264_PARAM_BAD_VALUE; int name_was_bool; int value_was_null = !value; if( !name ) return X264_PARAM_BAD_NAME; if( !value ) value = "true"; if( value[0] == '=' ) value++; if( strchr( name, '_' ) ) // s/_/-/g { char *c; name_buf = strdup(name); if( !name_buf ) return X264_PARAM_ALLOC_FAILED; while( (c = strchr( name_buf, '_' )) ) *c = '-'; name = name_buf; } if( !strncmp( name, "no", 2 ) ) { name += 2; if( name[0] == '-' ) name++; value = atobool(value) ? "false" : "true"; } name_was_bool = 0; #define OPT(STR) else if( !strcmp( name, STR ) ) #define OPT2(STR0, STR1) else if( !strcmp( name, STR0 ) || !strcmp( name, STR1 ) ) if( 0 ); OPT("asm") { p->cpu = X264_ISDIGIT(value[0]) ? (uint32_t)atoi(value) : !strcasecmp(value, "auto") || atobool(value) ? 
x264_cpu_detect() : 0; if( b_error ) { char *buf = strdup( value ); if( buf ) { char *tok, UNUSED *saveptr=NULL, *init; b_error = 0; p->cpu = 0; for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) { int i = 0; while( x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name) ) i++; p->cpu |= x264_cpu_names[i].flags; if( !x264_cpu_names[i].flags ) b_error = 1; } free( buf ); if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) p->cpu |= X264_CPU_SSE2_IS_FAST; } else errortype = X264_PARAM_ALLOC_FAILED; } } OPT("threads") { if( !strcasecmp(value, "auto") ) p->i_threads = X264_THREADS_AUTO; else p->i_threads = atoi(value); } OPT("lookahead-threads") { if( !strcasecmp(value, "auto") ) p->i_lookahead_threads = X264_THREADS_AUTO; else p->i_lookahead_threads = atoi(value); } OPT("sliced-threads") p->b_sliced_threads = atobool(value); OPT("sync-lookahead") { if( !strcasecmp(value, "auto") ) p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO; else p->i_sync_lookahead = atoi(value); } OPT2("deterministic", "n-deterministic") p->b_deterministic = atobool(value); OPT("cpu-independent") p->b_cpu_independent = atobool(value); OPT2("level", "level-idc") { if( !strcmp(value, "1b") ) p->i_level_idc = 9; else if( atof(value) < 7 ) p->i_level_idc = (int)(10*atof(value)+.5); else p->i_level_idc = atoi(value); } OPT("bluray-compat") p->b_bluray_compat = atobool(value); OPT("avcintra-class") p->i_avcintra_class = atoi(value); OPT("avcintra-flavor") b_error |= parse_enum( value, x264_avcintra_flavor_names, &p->i_avcintra_flavor ); OPT("sar") { b_error |= ( 2 != sscanf( value, "%d:%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) && 2 != sscanf( value, "%d/%d", &p->vui.i_sar_width, &p->vui.i_sar_height ) ); } OPT("overscan") b_error |= parse_enum( value, x264_overscan_names, &p->vui.i_overscan ); OPT("videoformat") b_error |= parse_enum( value, x264_vidformat_names, &p->vui.i_vidformat ); OPT("fullrange") b_error |= parse_enum( value, x264_fullrange_names, &p->vui.b_fullrange ); OPT("colorprim") b_error |= parse_enum( value, x264_colorprim_names, &p->vui.i_colorprim ); OPT("transfer") b_error |= parse_enum( value, x264_transfer_names, &p->vui.i_transfer ); OPT("colormatrix") b_error |= parse_enum( value, x264_colmatrix_names, &p->vui.i_colmatrix ); OPT("chromaloc") { p->vui.i_chroma_loc = atoi(value); b_error |= ( p->vui.i_chroma_loc < 0 || p->vui.i_chroma_loc > 5 ); } OPT("mastering-display") { if( strcasecmp( value, "undef" ) ) { b_error |= sscanf( value, "G(%d,%d)B(%d,%d)R(%d,%d)WP(%d,%d)L(%"SCNd64",%"SCNd64")", &p->mastering_display.i_green_x, &p->mastering_display.i_green_y, &p->mastering_display.i_blue_x, &p->mastering_display.i_blue_y, &p->mastering_display.i_red_x, &p->mastering_display.i_red_y, &p->mastering_display.i_white_x, &p->mastering_display.i_white_y, &p->mastering_display.i_display_max, &p->mastering_display.i_display_min ) != 10; p->mastering_display.b_mastering_display = !b_error; } else p->mastering_display.b_mastering_display = 0; } OPT("cll") { if( strcasecmp( value, "undef" ) ) { b_error |= sscanf( value, "%d,%d", &p->content_light_level.i_max_cll, &p->content_light_level.i_max_fall ) != 2; p->content_light_level.b_cll = !b_error; } else p->content_light_level.b_cll = 0; } OPT("alternative-transfer") b_error |= parse_enum( value, x264_transfer_names, &p->i_alternative_transfer ); OPT("fps") { int64_t i_fps_num; int64_t i_fps_den; if( sscanf( value, "%"SCNd64"/%"SCNd64, &i_fps_num, &i_fps_den ) == 2 ) { p->i_fps_num = i_fps_num; p->i_fps_den = i_fps_den; b_error 
|= i_fps_num < 1 || i_fps_num > UINT32_MAX || i_fps_den < 1 || i_fps_den > UINT32_MAX; } else { double fps = atof(value); if( fps < 0.0005 || fps > INT_MAX ) b_error = 1; else if( fps <= INT_MAX/1000.0 ) { p->i_fps_num = (int)(fps * 1000.0 + .5); p->i_fps_den = 1000; } else { p->i_fps_num = atoi(value); p->i_fps_den = 1; } } } OPT2("ref", "frameref") p->i_frame_reference = atoi(value); OPT("dpb-size") p->i_dpb_size = atoi(value); OPT("keyint") { if( strstr( value, "infinite" ) ) p->i_keyint_max = X264_KEYINT_MAX_INFINITE; else p->i_keyint_max = atoi(value); } OPT2("min-keyint", "keyint-min") { p->i_keyint_min = atoi(value); if( p->i_keyint_max < p->i_keyint_min ) p->i_keyint_max = p->i_keyint_min; } OPT("scenecut") { p->i_scenecut_threshold = atobool(value); if( b_error || p->i_scenecut_threshold ) { b_error = 0; p->i_scenecut_threshold = atoi(value); } } OPT("intra-refresh") p->b_intra_refresh = atobool(value); OPT("bframes") p->i_bframe = atoi(value); OPT("b-adapt") { p->i_bframe_adaptive = atobool(value); if( b_error ) { b_error = 0; p->i_bframe_adaptive = atoi(value); } } OPT("b-bias") p->i_bframe_bias = atoi(value); OPT("b-pyramid") { b_error |= parse_enum( value, x264_b_pyramid_names, &p->i_bframe_pyramid ); if( b_error ) { b_error = 0; p->i_bframe_pyramid = atoi(value); } } OPT("open-gop") p->b_open_gop = atobool(value); OPT("nf") p->b_deblocking_filter = !atobool(value); OPT2("filter", "deblock") { if( 2 == sscanf( value, "%d:%d", &p->i_deblocking_filter_alphac0, &p->i_deblocking_filter_beta ) || 2 == sscanf( value, "%d,%d", &p->i_deblocking_filter_alphac0, &p->i_deblocking_filter_beta ) ) { p->b_deblocking_filter = 1; } else if( sscanf( value, "%d", &p->i_deblocking_filter_alphac0 ) ) { p->b_deblocking_filter = 1; p->i_deblocking_filter_beta = p->i_deblocking_filter_alphac0; } else p->b_deblocking_filter = atobool(value); } OPT("slice-max-size") p->i_slice_max_size = atoi(value); OPT("slice-max-mbs") p->i_slice_max_mbs = atoi(value); OPT("slice-min-mbs") p->i_slice_min_mbs = atoi(value); OPT("slices") p->i_slice_count = atoi(value); OPT("slices-max") p->i_slice_count_max = atoi(value); OPT("cabac") p->b_cabac = atobool(value); OPT("cabac-idc") p->i_cabac_init_idc = atoi(value); OPT("interlaced") p->b_interlaced = atobool(value); OPT("tff") p->b_interlaced = p->b_tff = atobool(value); OPT("bff") { p->b_interlaced = atobool(value); p->b_tff = !p->b_interlaced; } OPT("constrained-intra") p->b_constrained_intra = atobool(value); OPT("cqm") { if( strstr( value, "flat" ) ) p->i_cqm_preset = X264_CQM_FLAT; else if( strstr( value, "jvt" ) ) p->i_cqm_preset = X264_CQM_JVT; else CHECKED_ERROR_PARAM_STRDUP( p->psz_cqm_file, p, value ); } OPT("cqmfile") CHECKED_ERROR_PARAM_STRDUP( p->psz_cqm_file, p, value ); OPT("cqm4") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4iy, 16 ); b_error |= parse_cqm( value, p->cqm_4py, 16 ); b_error |= parse_cqm( value, p->cqm_4ic, 16 ); b_error |= parse_cqm( value, p->cqm_4pc, 16 ); } OPT("cqm8") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_8iy, 64 ); b_error |= parse_cqm( value, p->cqm_8py, 64 ); b_error |= parse_cqm( value, p->cqm_8ic, 64 ); b_error |= parse_cqm( value, p->cqm_8pc, 64 ); } OPT("cqm4i") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4iy, 16 ); b_error |= parse_cqm( value, p->cqm_4ic, 16 ); } OPT("cqm4p") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4py, 16 ); b_error |= parse_cqm( value, p->cqm_4pc, 16 ); } OPT("cqm4iy") { 
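/* The remaining cqm options each load a single matrix set: cqm4{i,p}{y,c}
 * fill one 4x4 list (intra/inter x luma/chroma), while cqm8i/cqm8p fill the
 * matching luma+chroma 8x8 lists; all of them switch i_cqm_preset to
 * X264_CQM_CUSTOM. */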
p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4iy, 16 ); } OPT("cqm4ic") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4ic, 16 ); } OPT("cqm4py") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4py, 16 ); } OPT("cqm4pc") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_4pc, 16 ); } OPT("cqm8i") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_8iy, 64 ); b_error |= parse_cqm( value, p->cqm_8ic, 64 ); } OPT("cqm8p") { p->i_cqm_preset = X264_CQM_CUSTOM; b_error |= parse_cqm( value, p->cqm_8py, 64 ); b_error |= parse_cqm( value, p->cqm_8pc, 64 ); } OPT("log") p->i_log_level = atoi(value); OPT("dump-yuv") CHECKED_ERROR_PARAM_STRDUP( p->psz_dump_yuv, p, value ); OPT2("analyse", "partitions") { p->analyse.inter = 0; if( strstr( value, "none" ) ) p->analyse.inter = 0; if( strstr( value, "all" ) ) p->analyse.inter = ~0; if( strstr( value, "i4x4" ) ) p->analyse.inter |= X264_ANALYSE_I4x4; if( strstr( value, "i8x8" ) ) p->analyse.inter |= X264_ANALYSE_I8x8; if( strstr( value, "p8x8" ) ) p->analyse.inter |= X264_ANALYSE_PSUB16x16; if( strstr( value, "p4x4" ) ) p->analyse.inter |= X264_ANALYSE_PSUB8x8; if( strstr( value, "b8x8" ) ) p->analyse.inter |= X264_ANALYSE_BSUB16x16; } OPT("8x8dct") p->analyse.b_transform_8x8 = atobool(value); OPT2("weightb", "weight-b") p->analyse.b_weighted_bipred = atobool(value); OPT("weightp") p->analyse.i_weighted_pred = atoi(value); OPT2("direct", "direct-pred") b_error |= parse_enum( value, x264_direct_pred_names, &p->analyse.i_direct_mv_pred ); OPT("chroma-qp-offset") p->analyse.i_chroma_qp_offset = atoi(value); OPT("me") b_error |= parse_enum( value, x264_motion_est_names, &p->analyse.i_me_method ); OPT2("merange", "me-range") p->analyse.i_me_range = atoi(value); OPT2("mvrange", "mv-range") p->analyse.i_mv_range = atoi(value); OPT2("mvrange-thread", "mv-range-thread") p->analyse.i_mv_range_thread = atoi(value); OPT2("subme", "subq") p->analyse.i_subpel_refine = atoi(value); OPT("psy-rd") { if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) || 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) || 2 == sscanf( value, "%f|%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis )) { } else if( sscanf( value, "%f", &p->analyse.f_psy_rd ) ) { p->analyse.f_psy_trellis = 0; } else { p->analyse.f_psy_rd = 0; p->analyse.f_psy_trellis = 0; } } OPT("psy") p->analyse.b_psy = atobool(value); OPT("chroma-me") p->analyse.b_chroma_me = atobool(value); OPT("mixed-refs") p->analyse.b_mixed_references = atobool(value); OPT("trellis") p->analyse.i_trellis = atoi(value); OPT("fast-pskip") p->analyse.b_fast_pskip = atobool(value); OPT("dct-decimate") p->analyse.b_dct_decimate = atobool(value); OPT("deadzone-inter") p->analyse.i_luma_deadzone[0] = atoi(value); OPT("deadzone-intra") p->analyse.i_luma_deadzone[1] = atoi(value); OPT("nr") p->analyse.i_noise_reduction = atoi(value); OPT("bitrate") { p->rc.i_bitrate = atoi(value); p->rc.i_rc_method = X264_RC_ABR; } OPT2("qp", "qp_constant") { p->rc.i_qp_constant = atoi(value); p->rc.i_rc_method = X264_RC_CQP; } OPT("crf") { p->rc.f_rf_constant = atof(value); p->rc.i_rc_method = X264_RC_CRF; } OPT("crf-max") p->rc.f_rf_constant_max = atof(value); OPT("rc-lookahead") p->rc.i_lookahead = atoi(value); OPT2("qpmin", "qp-min") p->rc.i_qp_min = atoi(value); OPT2("qpmax", "qp-max") p->rc.i_qp_max = atoi(value); OPT2("qpstep", "qp-step") p->rc.i_qp_step = atoi(value); 
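/* Caller-side sketch of this parser (illustrative, not part of the function):
 *
 *     int ret = x264_param_parse( &param, "crf", "20" );
 *     if( ret == X264_PARAM_BAD_NAME )
 *         ;  // no option by that name exists
 *     else if( ret == X264_PARAM_BAD_VALUE )
 *         ;  // option exists but the value was rejected
 *
 * A NULL value is treated as boolean "true", and a "no"/"no-" prefix on the
 * name inverts a boolean option, as handled near the top of this function. */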
OPT("ratetol") p->rc.f_rate_tolerance = !strncmp("inf", value, 3) ? 1e9 : atof(value); OPT("vbv-maxrate") p->rc.i_vbv_max_bitrate = atoi(value); OPT("vbv-bufsize") p->rc.i_vbv_buffer_size = atoi(value); OPT("vbv-init") p->rc.f_vbv_buffer_init = atof(value); OPT2("ipratio", "ip-factor") p->rc.f_ip_factor = atof(value); OPT2("pbratio", "pb-factor") p->rc.f_pb_factor = atof(value); OPT("aq-mode") p->rc.i_aq_mode = atoi(value); OPT("aq-strength") p->rc.f_aq_strength = atof(value); OPT("pass") { int pass = x264_clip3( atoi(value), 0, 3 ); p->rc.b_stat_write = pass & 1; p->rc.b_stat_read = pass & 2; } OPT("stats") { CHECKED_ERROR_PARAM_STRDUP( p->rc.psz_stat_in, p, value ); CHECKED_ERROR_PARAM_STRDUP( p->rc.psz_stat_out, p, value ); } OPT("qcomp") p->rc.f_qcompress = atof(value); OPT("mbtree") p->rc.b_mb_tree = atobool(value); OPT("qblur") p->rc.f_qblur = atof(value); OPT2("cplxblur", "cplx-blur") p->rc.f_complexity_blur = atof(value); OPT("zones") CHECKED_ERROR_PARAM_STRDUP( p->rc.psz_zones, p, value ); OPT("crop-rect") b_error |= sscanf( value, "%d,%d,%d,%d", &p->crop_rect.i_left, &p->crop_rect.i_top, &p->crop_rect.i_right, &p->crop_rect.i_bottom ) != 4; OPT("psnr") p->analyse.b_psnr = atobool(value); OPT("ssim") p->analyse.b_ssim = atobool(value); OPT("aud") p->b_aud = atobool(value); OPT("sps-id") p->i_sps_id = atoi(value); OPT("global-header") p->b_repeat_headers = !atobool(value); OPT("repeat-headers") p->b_repeat_headers = atobool(value); OPT("annexb") p->b_annexb = atobool(value); OPT("force-cfr") p->b_vfr_input = !atobool(value); OPT("nal-hrd") b_error |= parse_enum( value, x264_nal_hrd_names, &p->i_nal_hrd ); OPT("filler") p->rc.b_filler = atobool(value); OPT("pic-struct") p->b_pic_struct = atobool(value); OPT("fake-interlaced") p->b_fake_interlaced = atobool(value); OPT("frame-packing") p->i_frame_packing = atoi(value); OPT("stitchable") p->b_stitchable = atobool(value); OPT("opencl") p->b_opencl = atobool( value ); OPT("opencl-clbin") CHECKED_ERROR_PARAM_STRDUP( p->psz_clbin_file, p, value ); OPT("opencl-device") p->i_opencl_device = atoi( value ); else { b_error = 1; errortype = X264_PARAM_BAD_NAME; } #undef OPT #undef OPT2 #undef atobool #undef atoi #undef atof if( name_buf ) free( name_buf ); b_error |= value_was_null && !name_was_bool; return b_error ? 
errortype : 0; } /**************************************************************************** * x264_param2string: ****************************************************************************/ char *x264_param2string( x264_param_t *p, int b_res ) { int len = 2000; char *buf, *s; if( p->rc.psz_zones ) len += strlen(p->rc.psz_zones); buf = s = x264_malloc( len ); if( !buf ) return NULL; if( b_res ) { s += sprintf( s, "%dx%d ", p->i_width, p->i_height ); s += sprintf( s, "fps=%u/%u ", p->i_fps_num, p->i_fps_den ); s += sprintf( s, "timebase=%u/%u ", p->i_timebase_num, p->i_timebase_den ); s += sprintf( s, "bitdepth=%d ", p->i_bitdepth ); } if( p->b_opencl ) s += sprintf( s, "opencl=%d ", p->b_opencl ); s += sprintf( s, "cabac=%d", p->b_cabac ); s += sprintf( s, " ref=%d", p->i_frame_reference ); s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter, p->i_deblocking_filter_alphac0, p->i_deblocking_filter_beta ); s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter ); s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] ); s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine ); s += sprintf( s, " psy=%d", p->analyse.b_psy ); if( p->analyse.b_psy ) s += sprintf( s, " psy_rd=%.2f:%.2f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis ); s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references ); s += sprintf( s, " me_range=%d", p->analyse.i_me_range ); s += sprintf( s, " chroma_me=%d", p->analyse.b_chroma_me ); s += sprintf( s, " trellis=%d", p->analyse.i_trellis ); s += sprintf( s, " 8x8dct=%d", p->analyse.b_transform_8x8 ); s += sprintf( s, " cqm=%d", p->i_cqm_preset ); s += sprintf( s, " deadzone=%d,%d", p->analyse.i_luma_deadzone[0], p->analyse.i_luma_deadzone[1] ); s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip ); s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset ); s += sprintf( s, " threads=%d", p->i_threads ); s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads ); s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads ); if( p->i_slice_count ) s += sprintf( s, " slices=%d", p->i_slice_count ); if( p->i_slice_count_max ) s += sprintf( s, " slices_max=%d", p->i_slice_count_max ); if( p->i_slice_max_size ) s += sprintf( s, " slice_max_size=%d", p->i_slice_max_size ); if( p->i_slice_max_mbs ) s += sprintf( s, " slice_max_mbs=%d", p->i_slice_max_mbs ); if( p->i_slice_min_mbs ) s += sprintf( s, " slice_min_mbs=%d", p->i_slice_min_mbs ); s += sprintf( s, " nr=%d", p->analyse.i_noise_reduction ); s += sprintf( s, " decimate=%d", p->analyse.b_dct_decimate ); s += sprintf( s, " interlaced=%s", p->b_interlaced ? p->b_tff ? "tff" : "bff" : p->b_fake_interlaced ? "fake" : "0" ); s += sprintf( s, " bluray_compat=%d", p->b_bluray_compat ); if( p->b_stitchable ) s += sprintf( s, " stitchable=%d", p->b_stitchable ); s += sprintf( s, " constrained_intra=%d", p->b_constrained_intra ); s += sprintf( s, " bframes=%d", p->i_bframe ); if( p->i_bframe ) { s += sprintf( s, " b_pyramid=%d b_adapt=%d b_bias=%d direct=%d weightb=%d open_gop=%d", p->i_bframe_pyramid, p->i_bframe_adaptive, p->i_bframe_bias, p->analyse.i_direct_mv_pred, p->analyse.b_weighted_bipred, p->b_open_gop ); } s += sprintf( s, " weightp=%d", p->analyse.i_weighted_pred > 0 ? 
p->analyse.i_weighted_pred : 0 ); if( p->i_keyint_max == X264_KEYINT_MAX_INFINITE ) s += sprintf( s, " keyint=infinite" ); else s += sprintf( s, " keyint=%d", p->i_keyint_max ); s += sprintf( s, " keyint_min=%d scenecut=%d intra_refresh=%d", p->i_keyint_min, p->i_scenecut_threshold, p->b_intra_refresh ); if( p->rc.b_mb_tree || p->rc.i_vbv_buffer_size ) s += sprintf( s, " rc_lookahead=%d", p->rc.i_lookahead ); s += sprintf( s, " rc=%s mbtree=%d", p->rc.i_rc_method == X264_RC_ABR ? ( p->rc.b_stat_read ? "2pass" : p->rc.i_vbv_max_bitrate == p->rc.i_bitrate ? "cbr" : "abr" ) : p->rc.i_rc_method == X264_RC_CRF ? "crf" : "cqp", p->rc.b_mb_tree ); if( p->rc.i_rc_method == X264_RC_ABR || p->rc.i_rc_method == X264_RC_CRF ) { if( p->rc.i_rc_method == X264_RC_CRF ) s += sprintf( s, " crf=%.1f", p->rc.f_rf_constant ); else s += sprintf( s, " bitrate=%d ratetol=%.1f", p->rc.i_bitrate, p->rc.f_rate_tolerance ); s += sprintf( s, " qcomp=%.2f qpmin=%d qpmax=%d qpstep=%d", p->rc.f_qcompress, p->rc.i_qp_min, p->rc.i_qp_max, p->rc.i_qp_step ); if( p->rc.b_stat_read ) s += sprintf( s, " cplxblur=%.1f qblur=%.1f", p->rc.f_complexity_blur, p->rc.f_qblur ); if( p->rc.i_vbv_buffer_size ) { s += sprintf( s, " vbv_maxrate=%d vbv_bufsize=%d", p->rc.i_vbv_max_bitrate, p->rc.i_vbv_buffer_size ); if( p->rc.i_rc_method == X264_RC_CRF ) s += sprintf( s, " crf_max=%.1f", p->rc.f_rf_constant_max ); } } else if( p->rc.i_rc_method == X264_RC_CQP ) s += sprintf( s, " qp=%d", p->rc.i_qp_constant ); if( p->rc.i_vbv_buffer_size ) s += sprintf( s, " nal_hrd=%s filler=%d", x264_nal_hrd_names[p->i_nal_hrd], p->rc.b_filler ); if( p->crop_rect.i_left | p->crop_rect.i_top | p->crop_rect.i_right | p->crop_rect.i_bottom ) s += sprintf( s, " crop_rect=%d,%d,%d,%d", p->crop_rect.i_left, p->crop_rect.i_top, p->crop_rect.i_right, p->crop_rect.i_bottom ); if( p->mastering_display.b_mastering_display ) s += sprintf( s, " mastering-display=G(%d,%d)B(%d,%d)R(%d,%d)WP(%d,%d)L(%"PRId64",%"PRId64")", p->mastering_display.i_green_x, p->mastering_display.i_green_y, p->mastering_display.i_blue_x, p->mastering_display.i_blue_y, p->mastering_display.i_red_x, p->mastering_display.i_red_y, p->mastering_display.i_white_x, p->mastering_display.i_white_y, p->mastering_display.i_display_max, p->mastering_display.i_display_min ); if( p->content_light_level.b_cll ) s += sprintf( s, " cll=%d,%d", p->content_light_level.i_max_cll, p->content_light_level.i_max_fall ); if( p->i_frame_packing >= 0 ) s += sprintf( s, " frame-packing=%d", p->i_frame_packing ); if( !(p->rc.i_rc_method == X264_RC_CQP && p->rc.i_qp_constant == 0) ) { s += sprintf( s, " ip_ratio=%.2f", p->rc.f_ip_factor ); if( p->i_bframe && !p->rc.b_mb_tree ) s += sprintf( s, " pb_ratio=%.2f", p->rc.f_pb_factor ); s += sprintf( s, " aq=%d", p->rc.i_aq_mode ); if( p->rc.i_aq_mode ) s += sprintf( s, ":%.2f", p->rc.f_aq_strength ); if( p->rc.psz_zones ) s += sprintf( s, " zones=%s", p->rc.psz_zones ); else if( p->rc.i_zones ) s += sprintf( s, " zones" ); } return buf; } x264-master/common/base.h000066400000000000000000000264011502133446700154350ustar00rootroot00000000000000/***************************************************************************** * base.h: misc common functions (bit depth independent) ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * 
the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_BASE_H #define X264_BASE_H /**************************************************************************** * Macros (can be used in osdep.h) ****************************************************************************/ #define X264_MIN(a,b) ( (a)<(b) ? (a) : (b) ) #define X264_MAX(a,b) ( (a)>(b) ? (a) : (b) ) #define X264_MIN3(a,b,c) X264_MIN((a),X264_MIN((b),(c))) #define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c))) #define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d))) #define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d))) /**************************************************************************** * System includes ****************************************************************************/ #include "osdep.h" #include #include #include #include #include #include /**************************************************************************** * Macros ****************************************************************************/ #define XCHG(type,a,b) do { type t = a; a = b; b = t; } while( 0 ) #define FIX8(f) ((int)(f*(1<<8)+.5)) #define ARRAY_ELEMS(a) ((int)((sizeof(a))/(sizeof(a[0])))) #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) #define IS_DISPOSABLE(type) ( type == X264_TYPE_B ) /* Unions for type-punning. 
* Mn: load or store n bits, aligned, native-endian * CPn: copy n bits, aligned, native-endian * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */ typedef union { uint16_t i; uint8_t b[2]; } MAY_ALIAS x264_union16_t; typedef union { uint32_t i; uint16_t w[2]; uint8_t b[4]; } MAY_ALIAS x264_union32_t; typedef union { uint64_t i; uint32_t d[2]; uint16_t w[4]; uint8_t b[8]; } MAY_ALIAS x264_union64_t; typedef struct { uint64_t i[2]; } x264_uint128_t; typedef union { x264_uint128_t i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_t; #define M16(src) (((x264_union16_t*)(src))->i) #define M32(src) (((x264_union32_t*)(src))->i) #define M64(src) (((x264_union64_t*)(src))->i) #define M128(src) (((x264_union128_t*)(src))->i) #define M128_ZERO ((x264_uint128_t){{0,0}}) #define CP16(dst,src) M16(dst) = M16(src) #define CP32(dst,src) M32(dst) = M32(src) #define CP64(dst,src) M64(dst) = M64(src) #define CP128(dst,src) M128(dst) = M128(src) /* Macros for memory constraints of inline asm */ #if defined(__GNUC__) && __GNUC__ >= 8 && !defined(__clang__) && !defined(__INTEL_COMPILER) #define MEM_FIX(x, t, s) (*(t (*)[s])(x)) #define MEM_DYN(x, t) (*(t (*)[])(x)) #else //older versions of gcc prefer casting to structure instead of array #define MEM_FIX(x, t, s) (*(struct { t a[s]; } MAY_ALIAS (*))(x)) //let's set an arbitrary large constant size #define MEM_DYN(x, t) MEM_FIX(x, t, 4096) #endif /**************************************************************************** * Constants ****************************************************************************/ enum profile_e { PROFILE_BASELINE = 66, PROFILE_MAIN = 77, PROFILE_HIGH = 100, PROFILE_HIGH10 = 110, PROFILE_HIGH422 = 122, PROFILE_HIGH444_PREDICTIVE = 244, }; enum chroma_format_e { CHROMA_400 = 0, CHROMA_420 = 1, CHROMA_422 = 2, CHROMA_444 = 3, }; enum slice_type_e { SLICE_TYPE_P = 0, SLICE_TYPE_B = 1, SLICE_TYPE_I = 2, }; static const char slice_type_to_char[] = { 'P', 'B', 'I' }; enum sei_payload_type_e { SEI_BUFFERING_PERIOD = 0, SEI_PIC_TIMING = 1, SEI_PAN_SCAN_RECT = 2, SEI_FILLER = 3, SEI_USER_DATA_REGISTERED = 4, SEI_USER_DATA_UNREGISTERED = 5, SEI_RECOVERY_POINT = 6, SEI_DEC_REF_PIC_MARKING = 7, SEI_FRAME_PACKING = 45, SEI_MASTERING_DISPLAY = 137, SEI_CONTENT_LIGHT_LEVEL = 144, SEI_ALTERNATIVE_TRANSFER = 147, }; #define X264_BFRAME_MAX 16 #define X264_REF_MAX 16 #define X264_THREAD_MAX 128 #define X264_LOOKAHEAD_THREAD_MAX 16 #define X264_LOOKAHEAD_MAX 250 // number of pixels (per thread) in progress at any given time. // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety #define X264_THREAD_HEIGHT 24 /* WEIGHTP_FAKE is set when mb_tree & psy are enabled, but normal weightp is disabled * (such as in baseline). It checks for fades in lookahead and adjusts qp accordingly * to increase quality. Defined as (-1) so that if(i_weighted_pred > 0) is true only when * real weights are being used. */ #define X264_WEIGHTP_FAKE (-1) #define X264_SCAN8_LUMA_SIZE (5*8) #define X264_SCAN8_SIZE (X264_SCAN8_LUMA_SIZE*3) #define X264_SCAN8_0 (4+1*8) /* Scan8 organization: * 0 1 2 3 4 5 6 7 * 0 DY y y y y y * 1 y Y Y Y Y * 2 y Y Y Y Y * 3 y Y Y Y Y * 4 y Y Y Y Y * 5 DU u u u u u * 6 u U U U U * 7 u U U U U * 8 u U U U U * 9 u U U U U * 10 DV v v v v v * 11 v V V V V * 12 v V V V V * 13 v V V V V * 14 v V V V V * DY/DU/DV are for luma/chroma DC. 
*/ #define LUMA_DC 48 #define CHROMA_DC 49 static const uint8_t x264_scan8[16*3 + 3] = { 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8, 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8, 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8, 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8, 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8, 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8, 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8, 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8, 4+11*8, 5+11*8, 4+12*8, 5+12*8, 6+11*8, 7+11*8, 6+12*8, 7+12*8, 4+13*8, 5+13*8, 4+14*8, 5+14*8, 6+13*8, 7+13*8, 6+14*8, 7+14*8, 0+ 0*8, 0+ 5*8, 0+10*8 }; /**************************************************************************** * Includes ****************************************************************************/ #include "cpu.h" #include "tables.h" /**************************************************************************** * Inline functions ****************************************************************************/ static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max ) { return ( (v < i_min) ? i_min : (v > i_max) ? i_max : v ); } static ALWAYS_INLINE double x264_clip3f( double v, double f_min, double f_max ) { return ( (v < f_min) ? f_min : (v > f_max) ? f_max : v ); } /* Not a general-purpose function; multiplies input by -1/6 to convert * qp to qscale. */ static ALWAYS_INLINE int x264_exp2fix8( float x ) { int i = x*(-64.f/6.f) + 512.5f; if( i < 0 ) return 0; if( i > 1023 ) return 0xffff; return (x264_exp2_lut[i&63]+256) << (i>>6) >> 8; } static ALWAYS_INLINE float x264_log2( uint32_t x ) { int lz = x264_clz( x ); return x264_log2_lut[(x<>24)&0x7f] + x264_log2_lz_lut[lz]; } static ALWAYS_INLINE int x264_median( int a, int b, int c ) { int t = (a-b)&((a-b)>>31); a -= t; b += t; b -= (b-c)&((b-c)>>31); b += (a-b)&((a-b)>>31); return b; } static ALWAYS_INLINE void x264_median_mv( int16_t *dst, int16_t *a, int16_t *b, int16_t *c ) { dst[0] = x264_median( a[0], b[0], c[0] ); dst[1] = x264_median( a[1], b[1], c[1] ); } static ALWAYS_INLINE int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc ) { int sum = 0; for( int i = 0; i < i_mvc-1; i++ ) { sum += abs( mvc[i][0] - mvc[i+1][0] ) + abs( mvc[i][1] - mvc[i+1][1] ); } return sum; } static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvdtop ) { int amvd0 = mvdleft[0] + mvdtop[0]; int amvd1 = mvdleft[1] + mvdtop[1]; amvd0 = (amvd0 > 2) + (amvd0 > 32); amvd1 = (amvd1 > 2) + (amvd1 > 32); return amvd0 + (amvd1<<8); } /**************************************************************************** * General functions ****************************************************************************/ X264_API void x264_reduce_fraction( uint32_t *n, uint32_t *d ); X264_API void x264_reduce_fraction64( uint64_t *n, uint64_t *d ); X264_API void x264_log_default( void *p_unused, int i_level, const char *psz_fmt, va_list arg ); X264_API void x264_log_internal( int i_level, const char *psz_fmt, ... 
); /* x264_malloc: will do or emulate a memalign * you have to use x264_free for buffers allocated with x264_malloc */ X264_API void *x264_malloc( int64_t ); X264_API void x264_free( void * ); /* x264_slurp_file: malloc space for the whole file and read it */ X264_API char *x264_slurp_file( const char *filename ); /* x264_param_strdup: will do strdup and save returned pointer inside * x264_param_t for later freeing during x264_param_cleanup */ char *x264_param_strdup( x264_param_t *param, const char *src ); /* x264_param2string: return a (malloced) string containing most of * the encoding options */ X264_API char *x264_param2string( x264_param_t *p, int b_res ); /**************************************************************************** * Macros ****************************************************************************/ #define CHECKED_MALLOC( var, size )\ do {\ var = x264_malloc( size );\ if( !var )\ goto fail;\ } while( 0 ) #define CHECKED_MALLOCZERO( var, size )\ do {\ CHECKED_MALLOC( var, size );\ memset( var, 0, size );\ } while( 0 ) #define CHECKED_PARAM_STRDUP( var, param, src )\ do {\ var = x264_param_strdup( param, src );\ if( !var )\ goto fail;\ } while( 0 ) /* Macros for merging multiple allocations into a single large malloc, for improved * use with huge pages. */ /* Needs to be enough to contain any set of buffers that use combined allocations */ #define PREALLOC_BUF_SIZE 1024 #define PREALLOC_INIT\ int prealloc_idx = 0;\ int64_t prealloc_size = 0;\ uint8_t **preallocs[PREALLOC_BUF_SIZE]; #define PREALLOC( var, size )\ do {\ var = (void*)(intptr_t)prealloc_size;\ preallocs[prealloc_idx++] = (uint8_t**)&var;\ prealloc_size += ALIGN((int64_t)(size), NATIVE_ALIGN);\ } while( 0 ) #define PREALLOC_END( ptr )\ do {\ CHECKED_MALLOC( ptr, prealloc_size );\ while( prealloc_idx-- )\ *preallocs[prealloc_idx] = (uint8_t*)((intptr_t)(*preallocs[prealloc_idx]) + (intptr_t)ptr);\ } while( 0 ) #endif x264-master/common/bitstream.c000066400000000000000000000125651502133446700165160ustar00rootroot00000000000000/***************************************************************************** * bitstream.c: bitstream writing ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common.h" static uint8_t *nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end ) { if( src < end ) *dst++ = *src++; if( src < end ) *dst++ = *src++; while( src < end ) { if( src[0] <= 0x03 && !dst[-2] && !dst[-1] ) *dst++ = 0x03; *dst++ = *src++; } return dst; } #if HAVE_MMX #include "x86/bitstream.h" #endif #if HAVE_ARMV6 #include "arm/bitstream.h" #endif #if HAVE_AARCH64 #include "aarch64/bitstream.h" #endif /**************************************************************************** * x264_nal_encode: ****************************************************************************/ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ) { uint8_t *src = nal->p_payload; uint8_t *end = nal->p_payload + nal->i_payload; uint8_t *orig_dst = dst; if( h->param.b_annexb ) { if( nal->b_long_startcode ) *dst++ = 0x00; *dst++ = 0x00; *dst++ = 0x00; *dst++ = 0x01; } else /* save room for size later */ dst += 4; /* nal header */ *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type; dst = h->bsf.nal_escape( dst, src, end ); int size = dst - orig_dst; /* Apply AVC-Intra padding */ if( h->param.i_avcintra_class ) { int padding = nal->i_payload + nal->i_padding + NALU_OVERHEAD - size; if( padding > 0 ) { memset( dst, 0, padding ); size += padding; } nal->i_padding = X264_MAX( padding, 0 ); } /* Write the size header for mp4/etc */ if( !h->param.b_annexb ) { /* Size doesn't include the size of the header we're writing now. */ int chunk_size = size - 4; orig_dst[0] = (uint8_t)(chunk_size >> 24); orig_dst[1] = (uint8_t)(chunk_size >> 16); orig_dst[2] = (uint8_t)(chunk_size >> 8); orig_dst[3] = (uint8_t)(chunk_size >> 0); } nal->i_payload = size; nal->p_payload = orig_dst; x264_emms(); } void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf ) { memset( pf, 0, sizeof(*pf) ); pf->nal_escape = nal_escape_c; #if HAVE_MMX #if ARCH_X86_64 pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2; pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2; #endif if( cpu&X264_CPU_MMX2 ) pf->nal_escape = x264_nal_escape_mmx2; if( cpu&X264_CPU_SSE2 ) { if( cpu&X264_CPU_SSE2_IS_FAST ) pf->nal_escape = x264_nal_escape_sse2; } #if ARCH_X86_64 if( cpu&X264_CPU_LZCNT ) { pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt; pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt; } if( cpu&X264_CPU_SSSE3 ) { pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3; if( cpu&X264_CPU_LZCNT ) { pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt; } } if( cpu&X264_CPU_AVX2 ) { pf->nal_escape = x264_nal_escape_avx2; pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2; } if( cpu&X264_CPU_AVX512 ) { pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512; pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512; } #endif 
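/* Dispatch pattern used throughout x264: every entry in the function table
 * starts out as the portable C implementation (nal_escape_c above) and is
 * then overridden with the fastest variant the detected CPU flags permit, so
 * callers always go through pf->nal_escape without caring which one won. */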
#endif #if HAVE_ARMV6 if( cpu&X264_CPU_NEON ) pf->nal_escape = x264_nal_escape_neon; #endif #if HAVE_AARCH64 if( cpu&X264_CPU_NEON ) pf->nal_escape = x264_nal_escape_neon; #endif } x264-master/common/bitstream.h000066400000000000000000000213751502133446700165220ustar00rootroot00000000000000/***************************************************************************** * bitstream.h: bitstream writing ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_BS_H #define X264_BS_H typedef struct { uint16_t i_bits; uint8_t i_size; /* Next level table to use */ uint8_t i_next; } vlc_large_t; typedef struct bs_s { uint8_t *p_start; uint8_t *p; uint8_t *p_end; uintptr_t cur_bits; int i_left; /* i_count number of available bits */ int i_bits_encoded; /* RD only */ } bs_t; typedef struct { int32_t last; int32_t mask; ALIGNED_16( dctcoef level[18] ); } x264_run_level_t; typedef struct { uint8_t *(*nal_escape)( uint8_t *dst, uint8_t *src, uint8_t *end ); void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); } x264_bitstream_function_t; #define x264_bitstream_init x264_template(bitstream_init) void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf ); /* A larger level table size theoretically could help a bit at extremely * high bitrates, but the cost in cache is usually too high for it to be * useful. * This size appears to be optimal for QP18 encoding on a Nehalem CPU. * FIXME: Do further testing? */ #define LEVEL_TABLE_SIZE 128 #define x264_level_token x264_template(level_token) extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE]; /* The longest possible set of zero run codes sums to 25 bits. This leaves * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. 
*/ #define x264_run_before x264_template(run_before) extern uint32_t x264_run_before[1<<16]; static inline void bs_init( bs_t *s, void *p_data, int i_data ) { int offset = ((intptr_t)p_data & 3); s->p = s->p_start = (uint8_t*)p_data - offset; s->p_end = (uint8_t*)p_data + i_data; s->i_left = (WORD_SIZE - offset)*8; if( offset ) { s->cur_bits = endian_fix32( M32(s->p) ); s->cur_bits >>= (4-offset)*8; } else s->cur_bits = 0; } static inline int bs_pos( bs_t *s ) { return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left ); } /* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */ static inline void bs_flush( bs_t *s ) { M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) ); s->p += WORD_SIZE - (s->i_left >> 3); s->i_left = WORD_SIZE*8; } /* The inverse of bs_flush: prepare the bitstream to be written to again. */ static inline void bs_realign( bs_t *s ) { int offset = ((intptr_t)s->p & 3); if( offset ) { s->p = (uint8_t*)s->p - offset; s->i_left = (WORD_SIZE - offset)*8; s->cur_bits = endian_fix32( M32(s->p) ); s->cur_bits >>= (4-offset)*8; } } static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits ) { if( WORD_SIZE == 8 ) { s->cur_bits = (s->cur_bits << i_count) | i_bits; s->i_left -= i_count; if( s->i_left <= 32 ) { #if WORDS_BIGENDIAN M32( s->p ) = s->cur_bits >> (32 - s->i_left); #else M32( s->p ) = endian_fix( s->cur_bits << s->i_left ); #endif s->i_left += 32; s->p += 4; } } else { if( i_count < s->i_left ) { s->cur_bits = (s->cur_bits << i_count) | i_bits; s->i_left -= i_count; } else { i_count -= s->i_left; s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count); M32( s->p ) = endian_fix( s->cur_bits ); s->p += 4; s->cur_bits = i_bits; s->i_left = 32 - i_count; } } } /* Special case to eliminate branch in normal bs_write. */ /* Golomb never writes an even-size code, so this is only used in slice headers. 
*/ static inline void bs_write32( bs_t *s, uint32_t i_bits ) { bs_write( s, 16, i_bits >> 16 ); bs_write( s, 16, i_bits ); } static inline void bs_write1( bs_t *s, uint32_t i_bit ) { s->cur_bits <<= 1; s->cur_bits |= i_bit; s->i_left--; if( s->i_left == WORD_SIZE*8-32 ) { M32( s->p ) = endian_fix32( s->cur_bits ); s->p += 4; s->i_left = WORD_SIZE*8; } } static inline void bs_align_0( bs_t *s ) { bs_write( s, s->i_left&7, 0 ); bs_flush( s ); } static inline void bs_align_1( bs_t *s ) { bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 ); bs_flush( s ); } static inline void bs_align_10( bs_t *s ) { if( s->i_left&7 ) bs_write( s, s->i_left&7, 1 << ( (s->i_left&7) - 1 ) ); bs_flush( s ); } /* golomb functions */ static const uint8_t x264_ue_size_tab[256] = { 1, 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, }; static inline void bs_write_ue_big( bs_t *s, unsigned int val ) { int size = 0; int tmp = ++val; if( tmp >= 0x10000 ) { size = 32; tmp >>= 16; } if( tmp >= 0x100 ) { size += 16; tmp >>= 8; } size += x264_ue_size_tab[tmp]; bs_write( s, size>>1, 0 ); bs_write( s, (size>>1)+1, val ); } /* Only works on values under 255. */ static inline void bs_write_ue( bs_t *s, int val ) { bs_write( s, x264_ue_size_tab[val+1], val+1 ); } static inline void bs_write_se( bs_t *s, int val ) { int size = 0; /* Faster than (val <= 0 ? 
-val*2+1 : val*2) */ /* 4 instructions on x86, 3 on ARM */ int tmp = 1 - val*2; if( tmp < 0 ) tmp = val*2; val = tmp; if( tmp >= 0x100 ) { size = 16; tmp >>= 8; } size += x264_ue_size_tab[tmp]; bs_write( s, size, val ); } static inline void bs_write_te( bs_t *s, int x, int val ) { if( x == 1 ) bs_write1( s, 1^val ); else //if( x > 1 ) bs_write_ue( s, val ); } static inline void bs_rbsp_trailing( bs_t *s ) { bs_write1( s, 1 ); bs_write( s, s->i_left&7, 0 ); } static ALWAYS_INLINE int bs_size_ue( unsigned int val ) { return x264_ue_size_tab[val+1]; } static ALWAYS_INLINE int bs_size_ue_big( unsigned int val ) { if( val < 255 ) return x264_ue_size_tab[val+1]; else return x264_ue_size_tab[(val+1)>>8] + 16; } static ALWAYS_INLINE int bs_size_se( int val ) { int tmp = 1 - val*2; if( tmp < 0 ) tmp = val*2; if( tmp < 256 ) return x264_ue_size_tab[tmp]; else return x264_ue_size_tab[tmp>>8]+16; } static ALWAYS_INLINE int bs_size_te( int x, int val ) { if( x == 1 ) return 1; else //if( x > 1 ) return x264_ue_size_tab[val+1]; } #endif x264-master/common/cabac.c000066400000000000000000000137631502133446700155560ustar00rootroot00000000000000/***************************************************************************** * cabac.c: arithmetic coder ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" static uint8_t cabac_contexts[4][QP_MAX_SPEC+1][1024]; void x264_cabac_init( x264_t *h ) { int ctx_count = CHROMA444 ? 1024 : 460; for( int i = 0; i < 4; i++ ) { const int8_t (*cabac_context_init)[1024][2] = i == 0 ? &x264_cabac_context_init_I : &x264_cabac_context_init_PB[i-1]; for( int qp = 0; qp <= QP_MAX_SPEC; qp++ ) for( int j = 0; j < ctx_count; j++ ) { int state = x264_clip3( (((*cabac_context_init)[j][0] * qp) >> 4) + (*cabac_context_init)[j][1], 1, 126 ); cabac_contexts[i][qp][j] = (X264_MIN( state, 127-state ) << 1) | (state >> 6); } } } void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model ) { memcpy( cb->state, cabac_contexts[i_slice_type == SLICE_TYPE_I ? 0 : i_model + 1][i_qp], CHROMA444 ? 
1024 : 460 ); } void x264_cabac_encode_init_core( x264_cabac_t *cb ) { cb->i_low = 0; cb->i_range = 0x01FE; cb->i_queue = -9; // the first bit will be shifted away and not written cb->i_bytes_outstanding = 0; } void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end ) { x264_cabac_encode_init_core( cb ); cb->p_start = p_data; cb->p = p_data; cb->p_end = p_end; } static inline void cabac_putbyte( x264_cabac_t *cb ) { if( cb->i_queue >= 0 ) { int out = cb->i_low >> (cb->i_queue+10); cb->i_low &= (0x400<i_queue)-1; cb->i_queue -= 8; if( (out & 0xff) == 0xff ) cb->i_bytes_outstanding++; else { int carry = out >> 8; int bytes_outstanding = cb->i_bytes_outstanding; // this can't modify before the beginning of the stream because // that would correspond to a probability > 1. // it will write before the beginning of the stream, which is ok // because a slice header always comes before cabac data. // this can't carry beyond the one byte, because any 0xff bytes // are in bytes_outstanding and thus not written yet. cb->p[-1] += carry; while( bytes_outstanding > 0 ) { *(cb->p++) = (uint8_t)(carry-1); bytes_outstanding--; } *(cb->p++) = (uint8_t)out; cb->i_bytes_outstanding = 0; } } } static inline void cabac_encode_renorm( x264_cabac_t *cb ) { int shift = x264_cabac_renorm_shift[cb->i_range>>3]; cb->i_range <<= shift; cb->i_low <<= shift; cb->i_queue += shift; cabac_putbyte( cb ); } /* Making custom versions of this function, even in asm, for the cases where * b is known to be 0 or 1, proved to be somewhat useful on x86_32 with GCC 3.4 * but nearly useless with GCC 4.3 and worse than useless on x86_64. */ void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b ) { int i_state = cb->state[i_ctx]; int i_range_lps = x264_cabac_range_lps[i_state>>1][(cb->i_range>>6)-4]; cb->i_range -= i_range_lps; if( b != (i_state & 1) ) { cb->i_low += cb->i_range; cb->i_range = i_range_lps; } cb->state[i_ctx] = x264_cabac_transition[i_state][b]; cabac_encode_renorm( cb ); } /* Note: b is negated for this function */ void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b ) { cb->i_low <<= 1; cb->i_low += b & cb->i_range; cb->i_queue += 1; cabac_putbyte( cb ); } static const int bypass_lut[16] = { -1, 0x2, 0x14, 0x68, 0x1d0, 0x7a0, 0x1f40, 0x7e80, 0x1fd00, 0x7fa00, 0x1ff400, 0x7fe800, 0x1ffd000, 0x7ffa000, 0x1fff4000, 0x7ffe8000 }; void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val ) { uint32_t v = val + (1<i_low <<= i; cb->i_low += ((x>>k)&0xff) * cb->i_range; cb->i_queue += i; cabac_putbyte( cb ); i = 8; } while( k > 0 ); } void x264_cabac_encode_terminal_c( x264_cabac_t *cb ) { cb->i_range -= 2; cabac_encode_renorm( cb ); } void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb ) { cb->i_low += cb->i_range - 2; cb->i_low |= 1; cb->i_low <<= 9; cb->i_queue += 9; cabac_putbyte( cb ); cabac_putbyte( cb ); cb->i_low <<= -cb->i_queue; cb->i_low |= (0x35a4e4f5 >> (h->i_frame & 31) & 1) << 10; cb->i_queue = 0; cabac_putbyte( cb ); while( cb->i_bytes_outstanding > 0 ) { *(cb->p++) = 0xff; cb->i_bytes_outstanding--; } } x264-master/common/cabac.h000066400000000000000000000117761502133446700155650ustar00rootroot00000000000000/***************************************************************************** * cabac.h: arithmetic coder ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it 
under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_CABAC_H #define X264_CABAC_H typedef struct { /* state */ int i_low; int i_range; /* bit stream */ int i_queue; //stored with an offset of -8 for faster asm int i_bytes_outstanding; uint8_t *p_start; uint8_t *p; uint8_t *p_end; /* aligned for memcpy_aligned starting here */ ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision() /* context */ uint8_t state[1024]; /* for 16-byte alignment */ uint8_t padding[12]; } x264_cabac_t; /* init the contexts given i_slice_type, the quantif and the model */ #define x264_cabac_context_init x264_template(cabac_context_init) void x264_cabac_context_init( x264_t *h, x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model ); #define x264_cabac_encode_init_core x264_template(cabac_encode_init_core) void x264_cabac_encode_init_core( x264_cabac_t *cb ); #define x264_cabac_encode_init x264_template(cabac_encode_init) void x264_cabac_encode_init( x264_cabac_t *cb, uint8_t *p_data, uint8_t *p_end ); #define x264_cabac_encode_decision_c x264_template(cabac_encode_decision_c) void x264_cabac_encode_decision_c( x264_cabac_t *cb, int i_ctx, int b ); #define x264_cabac_encode_decision_asm x264_template(cabac_encode_decision_asm) void x264_cabac_encode_decision_asm( x264_cabac_t *cb, int i_ctx, int b ); #define x264_cabac_encode_bypass_c x264_template(cabac_encode_bypass_c) void x264_cabac_encode_bypass_c( x264_cabac_t *cb, int b ); #define x264_cabac_encode_bypass_asm x264_template(cabac_encode_bypass_asm) void x264_cabac_encode_bypass_asm( x264_cabac_t *cb, int b ); #define x264_cabac_encode_terminal_c x264_template(cabac_encode_terminal_c) void x264_cabac_encode_terminal_c( x264_cabac_t *cb ); #define x264_cabac_encode_terminal_asm x264_template(cabac_encode_terminal_asm) void x264_cabac_encode_terminal_asm( x264_cabac_t *cb ); #define x264_cabac_encode_ue_bypass x264_template(cabac_encode_ue_bypass) void x264_cabac_encode_ue_bypass( x264_cabac_t *cb, int exp_bits, int val ); #define x264_cabac_encode_flush x264_template(cabac_encode_flush) void x264_cabac_encode_flush( x264_t *h, x264_cabac_t *cb ); #if HAVE_MMX #define x264_cabac_encode_decision x264_cabac_encode_decision_asm #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm #elif HAVE_AARCH64 #define x264_cabac_encode_decision x264_cabac_encode_decision_asm #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm #else #define x264_cabac_encode_decision x264_cabac_encode_decision_c #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c #define x264_cabac_encode_terminal x264_cabac_encode_terminal_c #endif 
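/* Illustrative sketch only (not part of the upstream x264 API): how a caller
 * might drive the entropy coder declared above for a few symbols and then
 * flush the stream.  It assumes cb->state has already been filled in by
 * x264_cabac_context_init() for the current slice type/QP and that the output
 * buffer has enough headroom; the context index 0 and the coded values are
 * arbitrary placeholders, and the function name is hypothetical. */
static inline int x264_cabac_example_usage( x264_t *h, x264_cabac_t *cb,
                                            uint8_t *buf, uint8_t *buf_end )
{
    x264_cabac_encode_init( cb, buf, buf_end );  /* reset range/low and output pointers */
    x264_cabac_encode_decision( cb, 0, 1 );      /* one context-coded bin (context 0) */
    x264_cabac_encode_ue_bypass( cb, 0, 5 );     /* an Exp-Golomb value in bypass mode */
    x264_cabac_encode_terminal( cb );            /* end_of_slice_flag == 0 */
    x264_cabac_encode_flush( h, cb );            /* stop bit, alignment, outstanding bytes */
    return (int)(cb->p - cb->p_start);           /* bytes written so far */
}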
#define x264_cabac_encode_decision_noup x264_cabac_encode_decision static ALWAYS_INLINE int x264_cabac_pos( x264_cabac_t *cb ) { return (cb->p - cb->p_start + cb->i_bytes_outstanding) * 8 + cb->i_queue; } /* internal only. these don't write the bitstream, just calculate bit cost: */ static ALWAYS_INLINE void x264_cabac_size_decision( x264_cabac_t *cb, long i_ctx, long b ) { int i_state = cb->state[i_ctx]; cb->state[i_ctx] = x264_cabac_transition[i_state][b]; cb->f8_bits_encoded += x264_cabac_entropy[i_state^b]; } static ALWAYS_INLINE int x264_cabac_size_decision2( uint8_t *state, long b ) { int i_state = *state; *state = x264_cabac_transition[i_state][b]; return x264_cabac_entropy[i_state^b]; } static ALWAYS_INLINE void x264_cabac_size_decision_noup( x264_cabac_t *cb, long i_ctx, long b ) { int i_state = cb->state[i_ctx]; cb->f8_bits_encoded += x264_cabac_entropy[i_state^b]; } static ALWAYS_INLINE int x264_cabac_size_decision_noup2( uint8_t *state, long b ) { return x264_cabac_entropy[*state^b]; } #endif x264-master/common/common.c000066400000000000000000000034541502133446700160110ustar00rootroot00000000000000/***************************************************************************** * common.c: misc common functions ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" /**************************************************************************** * x264_log: ****************************************************************************/ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... ) { if( !h || i_level <= h->param.i_log_level ) { va_list arg; va_start( arg, psz_fmt ); if( !h ) x264_log_default( NULL, i_level, psz_fmt, arg ); else h->param.pf_log( h->param.p_log_private, i_level, psz_fmt, arg ); va_end( arg ); } } x264-master/common/common.h000066400000000000000000000703501502133446700160150ustar00rootroot00000000000000/***************************************************************************** * common.h: misc common functions ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_COMMON_H #define X264_COMMON_H #include "base.h" /* Macros for templating function calls according to bit depth */ #define x264_template(w) x264_glue3(x264, BIT_DEPTH, w) /**************************************************************************** * API Templates ****************************************************************************/ #define x264_nal_encode x264_template(nal_encode) #define x264_encoder_reconfig x264_template(encoder_reconfig) #define x264_encoder_parameters x264_template(encoder_parameters) #define x264_encoder_headers x264_template(encoder_headers) #define x264_encoder_encode x264_template(encoder_encode) #define x264_encoder_close x264_template(encoder_close) #define x264_encoder_delayed_frames x264_template(encoder_delayed_frames) #define x264_encoder_maximum_delayed_frames x264_template(encoder_maximum_delayed_frames) #define x264_encoder_intra_refresh x264_template(encoder_intra_refresh) #define x264_encoder_invalidate_reference x264_template(encoder_invalidate_reference) /* This undef allows to rename the external symbol and force link failure in case * of incompatible libraries. Then the define enables templating as above. */ #undef x264_encoder_open #define x264_encoder_open x264_template(encoder_open) /**************************************************************************** * Macros ****************************************************************************/ #define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16) #define QP_BD_OFFSET (6*(BIT_DEPTH-8)) #define QP_MAX_SPEC (51+QP_BD_OFFSET) #define QP_MAX (QP_MAX_SPEC+18) #define PIXEL_MAX ((1 << BIT_DEPTH)-1) // arbitrary, but low because SATD scores are 1/4 normal #define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET) #define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC) #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame #define FILLER_OVERHEAD (NALU_OVERHEAD+1) #define SEI_OVERHEAD (NALU_OVERHEAD - (h->param.b_annexb && !h->param.i_avcintra_class && (h->out.i_nal-1))) #if HAVE_INTERLACED # define MB_INTERLACED h->mb.b_interlaced # define SLICE_MBAFF h->sh.b_mbaff # define PARAM_INTERLACED h->param.b_interlaced #else # define MB_INTERLACED 0 # define SLICE_MBAFF 0 # define PARAM_INTERLACED 0 #endif #ifdef CHROMA_FORMAT # define CHROMA_H_SHIFT (CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422) # define CHROMA_V_SHIFT (CHROMA_FORMAT == CHROMA_420) #else # define CHROMA_FORMAT h->sps->i_chroma_format_idc # define CHROMA_H_SHIFT h->mb.chroma_h_shift # define CHROMA_V_SHIFT h->mb.chroma_v_shift #endif #define CHROMA_SIZE(s) (CHROMA_FORMAT ? 
(s)>>(CHROMA_H_SHIFT+CHROMA_V_SHIFT) : 0) #define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s)) #define CHROMA444 (CHROMA_FORMAT == CHROMA_444) #if HIGH_BIT_DEPTH typedef uint16_t pixel; typedef uint64_t pixel4; typedef int32_t dctcoef; typedef uint32_t udctcoef; # define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL) # define MPIXEL_X4(src) M64(src) #else typedef uint8_t pixel; typedef uint32_t pixel4; typedef int16_t dctcoef; typedef uint16_t udctcoef; # define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) # define MPIXEL_X4(src) M32(src) #endif #define SIZEOF_PIXEL ((int)sizeof(pixel)) #define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src) /**************************************************************************** * Includes ****************************************************************************/ #if HAVE_OPENCL #include "opencl.h" #endif #include "cabac.h" #include "bitstream.h" #include "set.h" #include "predict.h" #include "pixel.h" #include "mc.h" #include "frame.h" #include "dct.h" #include "quant.h" #include "threadpool.h" /**************************************************************************** * General functions ****************************************************************************/ /* log */ #define x264_log x264_template(log) void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... ); #define x264_cavlc_init x264_template(cavlc_init) void x264_cavlc_init( x264_t *h ); #define x264_cabac_init x264_template(cabac_init) void x264_cabac_init( x264_t *h ); static ALWAYS_INLINE pixel x264_clip_pixel( int x ) { return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x ); } /**************************************************************************** * ****************************************************************************/ typedef struct { x264_sps_t *sps; x264_pps_t *pps; int i_type; int i_first_mb; int i_last_mb; int i_pps_id; int i_frame_num; int b_mbaff; int b_field_pic; int b_bottom_field; int i_idr_pic_id; /* -1 if nal_type != 5 */ int i_poc; int i_delta_poc_bottom; int i_delta_poc[2]; int i_redundant_pic_cnt; int b_direct_spatial_mv_pred; int b_num_ref_idx_override; int i_num_ref_idx_l0_active; int i_num_ref_idx_l1_active; int b_ref_pic_list_reordering[2]; struct { int idc; int arg; } ref_pic_list_order[2][X264_REF_MAX]; /* P-frame weighting */ int b_weighted_pred; x264_weight_t weight[X264_REF_MAX*2][3]; int i_mmco_remove_from_end; int i_mmco_command_count; struct /* struct for future expansion */ { int i_difference_of_pic_nums; int i_poc; } mmco[X264_REF_MAX]; int i_cabac_init_idc; int i_qp; int i_qp_delta; int b_sp_for_swidth; int i_qs_delta; /* deblocking filter */ int i_disable_deblocking_filter_idc; int i_alpha_c0_offset; int i_beta_offset; } x264_slice_header_t; typedef struct x264_lookahead_t { volatile uint8_t b_exit_thread; uint8_t b_thread_active; uint8_t b_analyse_keyframe; int i_last_keyframe; int i_slicetype_length; x264_frame_t *last_nonb; x264_pthread_t thread_handle; x264_sync_frame_list_t ifbuf; x264_sync_frame_list_t next; x264_sync_frame_list_t ofbuf; } x264_lookahead_t; typedef struct x264_ratecontrol_t x264_ratecontrol_t; typedef struct x264_left_table_t { uint8_t intra[4]; uint8_t nnz[4]; uint8_t nnz_chroma[4]; uint8_t mv[4]; uint8_t ref[4]; } x264_left_table_t; /* Current frame stats */ typedef struct { /* MV bits (MV+Ref+Block Type) */ int i_mv_bits; /* Texture bits (DCT coefs) */ int i_tex_bits; /* ? 
*/ int i_misc_bits; /* MB type counts */ int i_mb_count[19]; int i_mb_count_i; int i_mb_count_p; int i_mb_count_skip; int i_mb_count_8x8dct[2]; int i_mb_count_ref[2][X264_REF_MAX*2]; int i_mb_partition[17]; int i_mb_cbp[6]; int i_mb_pred_mode[4][13]; int i_mb_field[3]; /* Adaptive direct mv pred */ int i_direct_score[2]; /* Metrics */ int64_t i_ssd[3]; double f_ssim; int i_ssim_cnt; } x264_frame_stat_t; struct x264_t { /* encoder parameters */ x264_param_t param; /* opaque pointer to bit depth independent interface */ void *api; x264_t *thread[X264_THREAD_MAX+1]; x264_t *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX]; int b_thread_active; int i_thread_phase; /* which thread to use for the next frame */ int i_thread_idx; /* which thread this is */ int i_threadslice_start; /* first row in this thread slice */ int i_threadslice_end; /* row after the end of this thread slice */ int i_threadslice_pass; /* which pass of encoding we are on */ x264_threadpool_t *threadpool; x264_threadpool_t *lookaheadpool; x264_pthread_mutex_t mutex; x264_pthread_cond_t cv; /* bitstream output */ struct { int i_nal; int i_nals_allocated; x264_nal_t *nal; int i_bitstream; /* size of p_bitstream */ uint8_t *p_bitstream; /* will hold data for all nal */ bs_t bs; } out; uint8_t *nal_buffer; int nal_buffer_size; x264_t *reconfig_h; int reconfig; /**** thread synchronization starts here ****/ /* frame number/poc */ int i_frame; int i_frame_num; int i_thread_frames; /* Number of different frames being encoded by threads; * 1 when sliced-threads is on. */ int i_nal_type; int i_nal_ref_idc; int64_t i_disp_fields; /* Number of displayed fields (both coded and implied via pic_struct) */ int i_disp_fields_last_frame; int64_t i_prev_duration; /* Duration of previous frame */ int64_t i_coded_fields; /* Number of coded fields (both coded and implied via pic_struct) */ int64_t i_cpb_delay; /* Equal to number of fields preceding this field * since last buffering_period SEI */ int64_t i_coded_fields_lookahead; /* Use separate counters for lookahead */ int64_t i_cpb_delay_lookahead; int64_t i_cpb_delay_pir_offset; int64_t i_cpb_delay_pir_offset_next; int b_queued_intra_refresh; int64_t i_last_idr_pts; int i_idr_pic_id; /* quantization matrix for decoding, [cqm][qp%6][coef] */ int (*dequant4_mf[4])[16]; /* [4][6][16] */ int (*dequant8_mf[4])[64]; /* [4][6][64] */ /* quantization matrix for trellis, [cqm][qp][coef] */ int (*unquant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ int (*unquant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ /* quantization matrix for deadzone */ udctcoef (*quant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ udctcoef (*quant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ udctcoef (*quant4_bias[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ udctcoef (*quant8_bias[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ udctcoef (*quant4_bias0[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ udctcoef (*nr_offset_emergency)[4][64]; /* mv/ref/mode cost arrays. 
*/ uint16_t *cost_mv[QP_MAX+1]; uint16_t *cost_mv_fpel[QP_MAX+1][4]; struct { uint16_t ref[QP_MAX+1][3][33]; uint16_t i4x4_mode[QP_MAX+1][17]; } *cost_table; const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */ /* Slice header */ x264_slice_header_t sh; /* SPS / PPS */ x264_sps_t sps[1]; x264_pps_t pps[1]; /* Slice header backup, for SEI_DEC_REF_PIC_MARKING */ int b_sh_backup; x264_slice_header_t sh_backup; /* cabac context */ x264_cabac_t cabac; struct { /* Frames to be encoded (whose types have been decided) */ x264_frame_t **current; /* Unused frames: 0 = fenc, 1 = fdec */ x264_frame_t **unused[2]; /* Unused blank frames (for duplicates) */ x264_frame_t **blank_unused; /* frames used for reference + sentinels */ x264_frame_t *reference[X264_REF_MAX+2]; int i_last_keyframe; /* Frame number of the last keyframe */ int i_last_idr; /* Frame number of the last IDR (not RP)*/ int i_poc_last_open_gop; /* Poc of the I frame of the last open-gop. The value * is only assigned during the period between that * I frame and the next P or I frame, else -1 */ int i_input; /* Number of input frames already accepted */ int i_max_dpb; /* Number of frames allocated in the decoded picture buffer */ int i_max_ref0; int i_max_ref1; int i_delay; /* Number of frames buffered for B reordering */ int i_bframe_delay; int64_t i_bframe_delay_time; int64_t i_first_pts; int64_t i_prev_reordered_pts[2]; int64_t i_largest_pts; int64_t i_second_largest_pts; int b_have_lowres; /* Whether 1/2 resolution luma planes are being used */ int b_have_sub8x8_esa; } frames; /* current frame being encoded */ x264_frame_t *fenc; /* frame being reconstructed */ x264_frame_t *fdec; /* references lists */ int i_ref[2]; x264_frame_t *fref[2][X264_REF_MAX+3]; x264_frame_t *fref_nearest[2]; int b_ref_reorder[2]; /* hrd */ int initial_cpb_removal_delay; int initial_cpb_removal_delay_offset; int64_t i_reordered_pts_delay; /* Current MB DCT coeffs */ struct { ALIGNED_64( dctcoef luma16x16_dc[3][16] ); ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? ALIGNED_64( dctcoef luma8x8[12][64] ); ALIGNED_64( dctcoef luma4x4[16*3][16] ); } dct; /* MB table and cache for current frame/mb */ struct { int i_mb_width; int i_mb_height; int i_mb_count; /* number of mbs in a frame */ /* Chroma subsampling */ int chroma_h_shift; int chroma_v_shift; /* Strides */ int i_mb_stride; int i_b8_stride; int i_b4_stride; int left_b8[2]; int left_b4[2]; /* Current index */ int i_mb_x; int i_mb_y; int i_mb_xy; int i_b8_xy; int i_b4_xy; /* Search parameters */ int i_me_method; int i_subpel_refine; int b_chroma_me; int b_trellis; int b_noise_reduction; int b_dct_decimate; int i_psy_rd; /* Psy RD strength--fixed point value*/ int i_psy_trellis; /* Psy trellis strength--fixed point value*/ int b_interlaced; int b_adaptive_mbaff; /* MBAFF+subme 0 requires non-adaptive MBAFF i.e. all field mbs */ /* Allowed qpel MV range to stay within the picture + emulated edge pixels */ int mv_min[2]; int mv_max[2]; int mv_miny_row[3]; /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */ int mv_maxy_row[3]; /* Subpel MV range for motion search. * same mv_min/max but includes levels' i_mv_range. 
*/ int mv_min_spel[2]; int mv_max_spel[2]; int mv_miny_spel_row[3]; int mv_maxy_spel_row[3]; /* Fullpel MV range for motion search */ ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */ int mv_miny_fpel_row[3]; int mv_maxy_fpel_row[3]; /* neighboring MBs */ unsigned int i_neighbour; unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */ unsigned int i_neighbour4[16]; /* at the time the block is coded */ unsigned int i_neighbour_intra; /* for constrained intra pred */ unsigned int i_neighbour_frame; /* ignoring slice boundaries */ int i_mb_type_top; int i_mb_type_left[2]; int i_mb_type_topleft; int i_mb_type_topright; int i_mb_prev_xy; int i_mb_left_xy[2]; int i_mb_top_xy; int i_mb_topleft_xy; int i_mb_topright_xy; int i_mb_top_y; int i_mb_topleft_y; int i_mb_topright_y; const x264_left_table_t *left_index_table; int i_mb_top_mbpair_xy; int topleft_partition; int b_allow_skip; int field_decoding_flag; /**** thread synchronization ends here ****/ /* subsequent variables are either thread-local or constant, * and won't be copied from one thread to another */ /* mb table */ uint8_t *base; /* base pointer for all malloced data in this mb */ int8_t *type; /* mb type */ uint8_t *partition; /* mb partition */ int8_t *qp; /* mb qp */ int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */ int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */ /* actually has only 7 entries; set to 8 for write-combining optimizations */ uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */ int8_t *chroma_pred_mode; /* chroma_pred_mode. cabac only. for non intra I_PRED_CHROMA_DC(0) */ int16_t (*mv[2])[2]; /* mb mv. set to 0 for intra mb */ uint8_t (*mvd[2])[8][2]; /* absolute value of mb mv difference with predict, clipped to [0,33]. set to 0 if intra. cabac only */ int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */ int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */ int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */ int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */ int32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */ uint8_t *field; /* buffer for weighted versions of the reference frames */ pixel *p_weight_buf[X264_REF_MAX]; /* current value */ int i_type; int i_partition; ALIGNED_4( uint8_t i_sub_partition[4] ); int b_transform_8x8; int i_cbp_luma; int i_cbp_chroma; int i_intra16x16_pred_mode; int i_chroma_pred_mode; /* skip flags for i4x4 and i8x8 * 0 = encode as normal. * 1 (non-RD only) = the DCT is still in h->dct, restore fdec and skip reconstruction. * 2 (RD only) = the DCT has since been overwritten by RD; restore that too. */ int i_skip_intra; /* skip flag for motion compensation */ /* if we've already done MC, we don't need to do it again */ int b_skip_mc; /* set to true if we are re-encoding a macroblock. */ int b_reencode_mb; int ip_offset; /* Used by PIR to offset the quantizer of intra-refresh blocks. */ int b_deblock_rdo; int b_overflow; /* If CAVLC had a level code overflow during bitstream writing. 
*/ struct { /* space for p_fenc and p_fdec */ #define FENC_STRIDE 16 #define FDEC_STRIDE 32 ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] ); ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] ); /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ ALIGNED_32( pixel i4x4_fdec_buf[16*16] ); ALIGNED_32( pixel i8x8_fdec_buf[16*16] ); ALIGNED_64( dctcoef i8x8_dct_buf[3][64] ); ALIGNED_64( dctcoef i4x4_dct_buf[15][16] ); uint32_t i4x4_nnz_buf[4]; uint32_t i8x8_nnz_buf[4]; /* Psy trellis DCT data */ ALIGNED_64( dctcoef fenc_dct8[4][64] ); ALIGNED_64( dctcoef fenc_dct4[16][16] ); /* Psy RD SATD/SA8D scores cache */ ALIGNED_64( uint32_t fenc_satd_cache[32] ); ALIGNED_16( uint64_t fenc_hadamard_cache[9] ); int i4x4_cbp; int i8x8_cbp; /* pointer over mb of the frame to be compressed */ pixel *p_fenc[3]; /* y,u,v */ /* pointer to the actual source frame, not a block copy */ pixel *p_fenc_plane[3]; /* pointer over mb of the frame to be reconstructed */ pixel *p_fdec[3]; /* pointer over mb of the references */ int i_fref[2]; /* [12]: yN, yH, yV, yHV, (NV12 ? uv : I444 ? (uN, uH, uV, uHV, vN, ...)) */ pixel *p_fref[2][X264_REF_MAX*2][12]; pixel *p_fref_w[X264_REF_MAX*2]; /* weighted fullpel luma */ uint16_t *p_integral[2][X264_REF_MAX]; /* fref stride */ int i_stride[3]; } pic; /* cache */ struct { /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */ ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] ); /* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */ ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] ); /* -1 if unused, -2 if unavailable */ ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); /* 0 if not available */ ALIGNED_16( int16_t mv[2][X264_SCAN8_LUMA_SIZE][2] ); ALIGNED_8( uint8_t mvd[2][X264_SCAN8_LUMA_SIZE][2] ); /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */ ALIGNED_4( int8_t skip[X264_SCAN8_LUMA_SIZE] ); ALIGNED_4( int16_t direct_mv[2][4][2] ); ALIGNED_4( int8_t direct_ref[2][4] ); int direct_partition; ALIGNED_4( int16_t pskip_mv[2] ); /* number of neighbors (top and left) that used 8x8 dct */ int i_neighbour_transform_size; int i_neighbour_skip; /* neighbor CBPs */ int i_cbp_top; int i_cbp_left; /* extra data required for mbaff in mv prediction */ int16_t topright_mv[2][3][2]; int8_t topright_ref[2][3]; /* current mb deblock strength */ uint8_t (*deblock_strength)[8][4]; } cache; /* */ int i_qp; /* current qp */ int i_chroma_qp; int i_last_qp; /* last qp */ int i_last_dqp; /* last delta qp */ int b_variable_qp; /* whether qp is allowed to vary per macroblock */ int b_lossless; int b_direct_auto_read; /* take stats for --direct auto from the 2pass log */ int b_direct_auto_write; /* analyse direct modes, to use and/or save */ /* lambda values */ int i_trellis_lambda2[2][2]; /* [luma,chroma][inter,intra] */ int i_psy_rd_lambda; int i_chroma_lambda2_offset; /* B_direct and weighted prediction */ int16_t dist_scale_factor_buf[2][2][X264_REF_MAX*2][4]; int16_t (*dist_scale_factor)[4]; int8_t bipred_weight_buf[2][2][X264_REF_MAX*2][4]; int8_t (*bipred_weight)[4]; /* maps fref1[0]'s ref indices into the current list0 */ #define map_col_to_list0(col) h->mb.map_col_to_list0[(col)+2] int8_t map_col_to_list0[X264_REF_MAX+2]; int ref_blind_dupe; /* The index of the blind reference frame duplicate. 
*/ int8_t deblock_ref_table[X264_REF_MAX*2+2]; #define deblock_ref_table(x) h->mb.deblock_ref_table[(x)+2] } mb; /* rate control encoding only */ x264_ratecontrol_t *rc; /* stats */ struct { /* Cumulated stats */ /* per slice info */ int i_frame_count[3]; int64_t i_frame_size[3]; double f_frame_qp[3]; int i_consecutive_bframes[X264_BFRAME_MAX+1]; /* */ double f_ssd_global[3]; double f_psnr_average[3]; double f_psnr_mean_y[3]; double f_psnr_mean_u[3]; double f_psnr_mean_v[3]; double f_ssim_mean_y[3]; double f_frame_duration[3]; /* */ int64_t i_mb_count[3][19]; int64_t i_mb_partition[2][17]; int64_t i_mb_count_8x8dct[2]; int64_t i_mb_count_ref[2][2][X264_REF_MAX*2]; int64_t i_mb_cbp[6]; int64_t i_mb_pred_mode[4][13]; int64_t i_mb_field[3]; /* */ int i_direct_score[2]; int i_direct_frames[2]; /* num p-frames weighted */ int i_wpred[2]; /* Current frame stats */ x264_frame_stat_t frame; } stat; /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4, 3 = chroma 8x8 */ udctcoef (*nr_offset)[64]; uint32_t (*nr_residual_sum)[64]; uint32_t *nr_count; ALIGNED_32( udctcoef nr_offset_denoise[4][64] ); ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] ); uint32_t nr_count_buf[2][4]; uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */ /* Buffers that are allocated per-thread even in sliced threads. */ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */ void *scratch_buffer2; /* if the first one's already in use */ pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ /* Deblock strength values are stored for each 4x4 partition. In MBAFF * there are four extra values that need to be stored, located in [4][i]. */ uint8_t (*deblock_strength[2])[2][8][4]; /* CPU functions dependents */ x264_predict_t predict_16x16[4+3]; x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; x264_predict_t predict_chroma[4+3]; x264_predict_t predict_8x8c[4+3]; x264_predict_t predict_8x16c[4+3]; x264_predict_8x8_filter_t predict_8x8_filter; x264_pixel_function_t pixf; x264_mc_functions_t mc; x264_dct_function_t dctf; x264_zigzag_function_t zigzagf; x264_zigzag_function_t zigzagf_interlaced; x264_zigzag_function_t zigzagf_progressive; x264_quant_function_t quantf; x264_deblock_function_t loopf; x264_bitstream_function_t bsf; x264_lookahead_t *lookahead; #if HAVE_OPENCL x264_opencl_t opencl; #endif }; typedef struct { int sad; int16_t mv[2]; } mvsad_t; // included at the end because it needs x264_t #include "macroblock.h" static ALWAYS_INLINE int x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { int cnt = 0; for( int i = 0; i < i_mvc; i++ ) { int mx = (mvc[i][0] + 2) >> 2; int my = (mvc[i][1] + 2) >> 2; uint32_t mv = pack16to32_mask(mx, my); if( !mv || mv == pmv ) continue; dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] ); dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] ); cnt++; } return cnt; } static ALWAYS_INLINE int x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { int cnt = 0; int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2}; for( int i = 0; i < i_mvc; i++ ) { uint32_t mv = M32( mvc[i] ); int mx = mvc[i][0]; int my = mvc[i][1]; if( !mv || mv == pmv ) continue; dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] ); dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] 
); cnt++; } return cnt; } #if ARCH_X86 || ARCH_X86_64 #include "x86/util.h" #endif #include "rectangle.h" #endif x264-master/common/cpu.c000066400000000000000000000450421502133446700153070ustar00rootroot00000000000000/***************************************************************************** * cpu.c: cpu detection ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "base.h" #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO #include #endif #if HAVE_SYSCONF #include #endif #if SYS_LINUX #include #endif #if SYS_BEOS #include #endif #if SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD #include #include #endif #if SYS_OPENBSD #include #endif const x264_cpu_name_t x264_cpu_names[] = { #if ARCH_X86 || ARCH_X86_64 // {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore #define MMX2 X264_CPU_MMX|X264_CPU_MMX2 {"MMX2", MMX2}, {"MMXEXT", MMX2}, {"SSE", MMX2|X264_CPU_SSE}, #define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2 {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, {"SSE2", SSE2}, {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, {"LZCNT", SSE2|X264_CPU_LZCNT}, {"SSE3", SSE2|X264_CPU_SSE3}, {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, #define AVX SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX {"AVX", AVX}, {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, {"FMA3", AVX|X264_CPU_FMA3}, {"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1}, {"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2}, #define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2 {"AVX2", AVX2}, {"AVX512", AVX2|X264_CPU_AVX512}, #undef AVX2 #undef AVX #undef SSE2 #undef MMX2 {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, {"SlowAtom", X264_CPU_SLOW_ATOM}, {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, {"SlowPalignr", X264_CPU_SLOW_PALIGNR}, {"SlowShuffle", X264_CPU_SLOW_SHUFFLE}, {"UnalignedStack", X264_CPU_STACK_MOD4}, #elif ARCH_PPC {"Altivec", X264_CPU_ALTIVEC}, #elif ARCH_ARM {"ARMv6", X264_CPU_ARMV6}, {"NEON", X264_CPU_NEON}, {"FastNeonMRC", X264_CPU_FAST_NEON_MRC}, #elif ARCH_AARCH64 {"ARMv8", X264_CPU_ARMV8}, {"NEON", X264_CPU_NEON}, {"DotProd", X264_CPU_DOTPROD}, {"I8MM", X264_CPU_I8MM}, {"SVE", X264_CPU_SVE}, {"SVE2", X264_CPU_SVE2}, #elif ARCH_MIPS {"MSA", X264_CPU_MSA}, #elif ARCH_LOONGARCH {"LSX", X264_CPU_LSX}, {"LASX", 
X264_CPU_LASX}, #endif {"", 0}, }; static unsigned long x264_getauxval( unsigned long type ) { #if HAVE_GETAUXVAL return getauxval( type ); #elif HAVE_ELF_AUX_INFO unsigned long aux = 0; elf_aux_info( type, &aux, sizeof(aux) ); return aux; #else return 0; #endif } #if ((HAVE_ALTIVEC && SYS_LINUX) || (HAVE_ARMV6 && !HAVE_NEON)) && !(HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO) #include #include static sigjmp_buf jmpbuf; static volatile sig_atomic_t canjump = 0; static void sigill_handler( int sig ) { if( !canjump ) { signal( sig, SIG_DFL ); raise( sig ); } canjump = 0; siglongjmp( jmpbuf, 1 ); } #endif #if HAVE_MMX int x264_cpu_cpuid_test( void ); void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); uint64_t x264_cpu_xgetbv( int xcr ); uint32_t x264_cpu_detect( void ) { uint32_t cpu = 0; uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = {0}; uint32_t max_extended_cap, max_basic_cap; #if !ARCH_X86_64 if( !x264_cpu_cpuid_test() ) return 0; #endif x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 ); if( max_basic_cap == 0 ) return 0; x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); if( edx&0x00800000 ) cpu |= X264_CPU_MMX; else return cpu; if( edx&0x02000000 ) cpu |= X264_CPU_MMX2|X264_CPU_SSE; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) cpu |= X264_CPU_SSE3; if( ecx&0x00000200 ) cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST; if( ecx&0x00080000 ) cpu |= X264_CPU_SSE4; if( ecx&0x00100000 ) cpu |= X264_CPU_SSE42; if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */ { uint64_t xcr0 = x264_cpu_xgetbv( 0 ); if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */ { if( ecx&0x10000000 ) cpu |= X264_CPU_AVX; if( ecx&0x00001000 ) cpu |= X264_CPU_FMA3; if( max_basic_cap >= 7 ) { x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx ); if( ebx&0x00000008 ) cpu |= X264_CPU_BMI1; if( ebx&0x00000100 ) cpu |= X264_CPU_BMI2; if( ebx&0x00000020 ) cpu |= X264_CPU_AVX2; if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */ { if( (ebx&0xD0030000) == 0xD0030000 ) cpu |= X264_CPU_AVX512; } } } } x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; if( max_extended_cap >= 0x80000001 ) { x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ); if( ecx&0x00000020 ) cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */ if( ecx&0x00000040 ) /* SSE4a, AMD only */ { int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */ if( family == 0x14 ) { cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */ cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */ cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */ } if( family == 0x16 ) { cpu |= X264_CPU_SLOW_PSHUFB; /* Jaguar's pshufb isn't that slow, but it's slow enough * compared to alternate instruction sequences that this * is equal or faster on almost all such functions. 
*/ } } if( cpu & X264_CPU_AVX ) { if( ecx&0x00000800 ) /* XOP */ cpu |= X264_CPU_XOP; if( ecx&0x00010000 ) /* FMA4 */ cpu |= X264_CPU_FMA4; } if( !strcmp((char*)vendor, "AuthenticAMD") ) { if( edx&0x00400000 ) cpu |= X264_CPU_MMX2; if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) ) cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } } if( !strcmp((char*)vendor, "GenuineIntel") ) { x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); int family = ((eax>>8)&0xf) + ((eax>>20)&0xff); int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0); if( family == 6 ) { /* Detect Atom CPU */ if( model == 28 ) { cpu |= X264_CPU_SLOW_ATOM; cpu |= X264_CPU_SLOW_PSHUFB; } /* Conroe has a slow shuffle unit. Check the model number to make sure not * to include crippled low-end Penryns and Nehalems that don't have SSE4. */ else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 ) cpu |= X264_CPU_SLOW_SHUFFLE; } } if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42)) { /* cacheline size is specified in 3 places, any of which may be missing */ x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); int cache = (ebx&0xff00)>>5; // cflush size if( !cache && max_extended_cap >= 0x80000006 ) { x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx ); cache = ecx&0xff; // cacheline size } if( !cache && max_basic_cap >= 2 ) { // Cache and TLB Information static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 }; static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 }; uint32_t buf[4]; int max, i = 0; do { x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 ); max = buf[0]&0xff; buf[0] &= ~0xff; for( int j = 0; j < 4; j++ ) if( !(buf[j]>>31) ) while( buf[j] ) { if( strchr( cache32_ids, buf[j]&0xff ) ) cache = 32; if( strchr( cache64_ids, buf[j]&0xff ) ) cache = 64; buf[j] >>= 8; } } while( ++i < max ); } if( cache == 32 ) cpu |= X264_CPU_CACHELINE_32; else if( cache == 64 ) cpu |= X264_CPU_CACHELINE_64; else x264_log_internal( X264_LOG_WARNING, "unable to determine cacheline size\n" ); } #if STACK_ALIGNMENT < 16 cpu |= X264_CPU_STACK_MOD4; #endif return cpu; } #elif HAVE_ALTIVEC #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO #define HWCAP_PPC_ALTIVEC (1U << 28) uint32_t x264_cpu_detect( void ) { uint32_t flags = 0; unsigned long hwcap = x264_getauxval( AT_HWCAP ); if ( hwcap & HWCAP_PPC_ALTIVEC ) flags |= X264_CPU_ALTIVEC; return flags; } #elif SYS_MACOSX || SYS_FREEBSD || SYS_NETBSD || SYS_OPENBSD uint32_t x264_cpu_detect( void ) { /* Thank you VLC */ uint32_t cpu = 0; #if SYS_OPENBSD int selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC }; #elif SYS_MACOSX int selectors[2] = { CTL_HW, HW_VECTORUNIT }; #endif int has_altivec = 0; size_t length = sizeof( has_altivec ); #if SYS_MACOSX || SYS_OPENBSD int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 ); #elif SYS_NETBSD int error = sysctlbyname( "machdep.altivec", &has_altivec, &length, NULL, 0 ); #else int error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 ); #endif if( error == 0 && has_altivec != 0 ) cpu |= X264_CPU_ALTIVEC; return cpu; } #elif SYS_LINUX uint32_t x264_cpu_detect( void ) { #ifdef __NO_FPRS__ return 0; #else static void (*oldsig)( int ); oldsig = signal( SIGILL, sigill_handler ); if( sigsetjmp( jmpbuf, 1 ) ) { signal( SIGILL, oldsig ); return 0; } canjump = 1; asm volatile( "mtspr 256, %0\n\t" "vand 
0, 0, 0\n\t" : : "r"(-1) ); canjump = 0; signal( SIGILL, oldsig ); return X264_CPU_ALTIVEC; #endif } #else uint32_t x264_cpu_detect( void ) { return 0; } #endif #elif HAVE_ARMV6 void x264_cpu_neon_test( void ); int x264_cpu_fast_neon_mrc_test( void ); #define HWCAP_ARM_NEON (1U << 12) uint32_t x264_cpu_detect( void ) { uint32_t flags = 0; flags |= X264_CPU_ARMV6; #if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO unsigned long hwcap = x264_getauxval( AT_HWCAP ); if ( hwcap & HWCAP_ARM_NEON ) flags |= X264_CPU_NEON; #else // don't do this hack if compiled with -mfpu=neon #if !HAVE_NEON static void (* oldsig)( int ); oldsig = signal( SIGILL, sigill_handler ); if( sigsetjmp( jmpbuf, 1 ) ) { signal( SIGILL, oldsig ); return flags; } canjump = 1; x264_cpu_neon_test(); canjump = 0; signal( SIGILL, oldsig ); #endif flags |= X264_CPU_NEON; #endif // fast neon -> arm (Cortex-A9) detection relies on user access to the // cycle counter; this assumes ARMv7 performance counters. // NEON requires at least ARMv7, ARMv8 may require changes here, but // hopefully this hacky detection method will have been replaced by then. // Note that there is potential for a race condition if another program or // x264 instance disables or reinits the counters while x264 is using them, // which may result in incorrect detection and the counters stuck enabled. // right now Apple does not seem to support performance counters for this test // Don't test this on Windows; performance counters are readable, but // the PMNC is not readable. #if !defined(__MACH__) && !defined(_WIN32) flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0; #endif // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc) return flags; } #elif HAVE_AARCH64 #if defined(__linux__) || HAVE_ELF_AUX_INFO #define HWCAP_AARCH64_ASIMDDP (1U << 20) #define HWCAP_AARCH64_SVE (1U << 22) #define HWCAP2_AARCH64_SVE2 (1U << 1) #define HWCAP2_AARCH64_I8MM (1U << 13) static uint32_t detect_flags( void ) { uint32_t flags = 0; unsigned long hwcap = x264_getauxval( AT_HWCAP ); unsigned long hwcap2 = x264_getauxval( AT_HWCAP2 ); if ( hwcap & HWCAP_AARCH64_ASIMDDP ) flags |= X264_CPU_DOTPROD; if ( hwcap2 & HWCAP2_AARCH64_I8MM ) flags |= X264_CPU_I8MM; if ( hwcap & HWCAP_AARCH64_SVE ) flags |= X264_CPU_SVE; if ( hwcap2 & HWCAP2_AARCH64_SVE2 ) flags |= X264_CPU_SVE2; return flags; } #elif defined(__APPLE__) #include static int have_feature( const char *feature ) { int supported = 0; size_t size = sizeof(supported); if ( sysctlbyname( feature, &supported, &size, NULL, 0 ) ) return 0; return supported; } static uint32_t detect_flags( void ) { uint32_t flags = 0; if ( have_feature( "hw.optional.arm.FEAT_DotProd" ) ) flags |= X264_CPU_DOTPROD; if ( have_feature( "hw.optional.arm.FEAT_I8MM" ) ) flags |= X264_CPU_I8MM; /* No SVE and SVE2 feature detection available on Apple platforms. */ return flags; } #elif defined(_WIN32) #include static uint32_t detect_flags( void ) { uint32_t flags = 0; #ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE if ( IsProcessorFeaturePresent( PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE ) ) flags |= X264_CPU_DOTPROD; #endif #ifdef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE if ( IsProcessorFeaturePresent( PF_ARM_SVE_INSTRUCTIONS_AVAILABLE ) ) flags |= X264_CPU_SVE; #endif #ifdef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE if ( IsProcessorFeaturePresent( PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE ) ) flags |= X264_CPU_SVE2; #endif #ifdef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE /* There's no PF_* flag that indicates whether plain I8MM is available * or not. 
But if SVE_I8MM is available, that also implies that * regular I8MM is available. */ if ( IsProcessorFeaturePresent( PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE ) ) flags |= X264_CPU_I8MM; #endif return flags; } #endif uint32_t x264_cpu_detect( void ) { uint32_t flags = X264_CPU_ARMV8; #if HAVE_NEON flags |= X264_CPU_NEON; #endif // If these features are enabled unconditionally in the compiler, we can // assume that they are available. #ifdef __ARM_FEATURE_DOTPROD flags |= X264_CPU_DOTPROD; #endif #ifdef __ARM_FEATURE_MATMUL_INT8 flags |= X264_CPU_I8MM; #endif #ifdef __ARM_FEATURE_SVE flags |= X264_CPU_SVE; #endif #ifdef __ARM_FEATURE_SVE2 flags |= X264_CPU_SVE2; #endif // Where possible, try to do runtime detection as well. #if defined(__linux__) || HAVE_ELF_AUX_INFO || \ defined(__APPLE__) || defined(_WIN32) flags |= detect_flags(); #endif return flags; } #elif HAVE_MSA uint32_t x264_cpu_detect( void ) { return X264_CPU_MSA; } #elif HAVE_LSX #define LA_HWCAP_LSX ( 1U << 4 ) #define LA_HWCAP_LASX ( 1U << 5 ) uint32_t x264_cpu_detect( void ) { uint32_t flags = 0; uint32_t hwcap = (uint32_t)x264_getauxval( AT_HWCAP ); if( hwcap & LA_HWCAP_LSX ) flags |= X264_CPU_LSX; if( hwcap & LA_HWCAP_LASX ) flags |= X264_CPU_LASX; return flags; } #else uint32_t x264_cpu_detect( void ) { return 0; } #endif int x264_cpu_num_processors( void ) { #if !HAVE_THREAD return 1; #elif SYS_WINDOWS return x264_pthread_num_processors_np(); #elif SYS_LINUX cpu_set_t p_aff; memset( &p_aff, 0, sizeof(p_aff) ); if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) ) return 1; #if HAVE_CPU_COUNT return CPU_COUNT(&p_aff); #else int np = 0; for( size_t bit = 0; bit < 8 * sizeof(p_aff); bit++ ) np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1; return np; #endif #elif SYS_BEOS system_info info; get_system_info( &info ); return info.cpu_count; #elif SYS_MACOSX int ncpu; size_t length = sizeof( ncpu ); if( sysctlbyname("hw.logicalcpu", &ncpu, &length, NULL, 0) ) { ncpu = 1; } return ncpu; #elif defined(_SC_NPROCESSORS_ONLN) return sysconf( _SC_NPROCESSORS_ONLN ); #elif defined(_SC_NPROCESSORS_CONF) return sysconf( _SC_NPROCESSORS_CONF ); #else return 1; #endif } x264-master/common/cpu.h000066400000000000000000000041471502133446700153150ustar00rootroot00000000000000/***************************************************************************** * cpu.h: cpu detection ***************************************************************************** * Copyright (C) 2004-2025 x264 project * * Authors: Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_CPU_H #define X264_CPU_H X264_API uint32_t x264_cpu_detect( void ); X264_API int x264_cpu_num_processors( void ); void x264_cpu_emms( void ); void x264_cpu_sfence( void ); #if HAVE_MMX /* There is no way to forbid the compiler from using float instructions * before the emms so miscompilation could theoretically occur in the * unlikely event that the compiler reorders emms and float instructions. */ #if HAVE_X86_INLINE_ASM /* Clobbering memory makes the compiler less likely to reorder code. */ #define x264_emms() asm volatile( "emms":::"memory","st","st(1)","st(2)", \ "st(3)","st(4)","st(5)","st(6)","st(7)" ) #else #define x264_emms() x264_cpu_emms() #endif #else #define x264_emms() #endif #define x264_sfence x264_cpu_sfence typedef struct { const char *name; uint32_t flags; } x264_cpu_name_t; X264_API extern const x264_cpu_name_t x264_cpu_names[]; #endif x264-master/common/dct.c000066400000000000000000001114761502133446700152770ustar00rootroot00000000000000/***************************************************************************** * dct.c: transform and zigzag ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common.h" #if HAVE_MMX # include "x86/dct.h" #endif #if HAVE_ALTIVEC # include "ppc/dct.h" #endif #if HAVE_ARMV6 # include "arm/dct.h" #endif #if HAVE_AARCH64 # include "aarch64/dct.h" #endif #if HAVE_MSA # include "mips/dct.h" #endif #if HAVE_LSX # include "loongarch/dct.h" #endif static void dct4x4dc( dctcoef d[16] ) { dctcoef tmp[16]; for( int i = 0; i < 4; i++ ) { int s01 = d[i*4+0] + d[i*4+1]; int d01 = d[i*4+0] - d[i*4+1]; int s23 = d[i*4+2] + d[i*4+3]; int d23 = d[i*4+2] - d[i*4+3]; tmp[0*4+i] = s01 + s23; tmp[1*4+i] = s01 - s23; tmp[2*4+i] = d01 - d23; tmp[3*4+i] = d01 + d23; } for( int i = 0; i < 4; i++ ) { int s01 = tmp[i*4+0] + tmp[i*4+1]; int d01 = tmp[i*4+0] - tmp[i*4+1]; int s23 = tmp[i*4+2] + tmp[i*4+3]; int d23 = tmp[i*4+2] - tmp[i*4+3]; d[i*4+0] = ( s01 + s23 + 1 ) >> 1; d[i*4+1] = ( s01 - s23 + 1 ) >> 1; d[i*4+2] = ( d01 - d23 + 1 ) >> 1; d[i*4+3] = ( d01 + d23 + 1 ) >> 1; } } static void idct4x4dc( dctcoef d[16] ) { dctcoef tmp[16]; for( int i = 0; i < 4; i++ ) { int s01 = d[i*4+0] + d[i*4+1]; int d01 = d[i*4+0] - d[i*4+1]; int s23 = d[i*4+2] + d[i*4+3]; int d23 = d[i*4+2] - d[i*4+3]; tmp[0*4+i] = s01 + s23; tmp[1*4+i] = s01 - s23; tmp[2*4+i] = d01 - d23; tmp[3*4+i] = d01 + d23; } for( int i = 0; i < 4; i++ ) { int s01 = tmp[i*4+0] + tmp[i*4+1]; int d01 = tmp[i*4+0] - tmp[i*4+1]; int s23 = tmp[i*4+2] + tmp[i*4+3]; int d23 = tmp[i*4+2] - tmp[i*4+3]; d[i*4+0] = s01 + s23; d[i*4+1] = s01 - s23; d[i*4+2] = d01 - d23; d[i*4+3] = d01 + d23; } } static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) { int a0 = dct4x4[0][0] + dct4x4[1][0]; int a1 = dct4x4[2][0] + dct4x4[3][0]; int a2 = dct4x4[4][0] + dct4x4[5][0]; int a3 = dct4x4[6][0] + dct4x4[7][0]; int a4 = dct4x4[0][0] - dct4x4[1][0]; int a5 = dct4x4[2][0] - dct4x4[3][0]; int a6 = dct4x4[4][0] - dct4x4[5][0]; int a7 = dct4x4[6][0] - dct4x4[7][0]; int b0 = a0 + a1; int b1 = a2 + a3; int b2 = a4 + a5; int b3 = a6 + a7; int b4 = a0 - a1; int b5 = a2 - a3; int b6 = a4 - a5; int b7 = a6 - a7; dct[0] = b0 + b1; dct[1] = b2 + b3; dct[2] = b0 - b1; dct[3] = b2 - b3; dct[4] = b4 - b5; dct[5] = b6 - b7; dct[6] = b4 + b5; dct[7] = b6 + b7; dct4x4[0][0] = 0; dct4x4[1][0] = 0; dct4x4[2][0] = 0; dct4x4[3][0] = 0; dct4x4[4][0] = 0; dct4x4[5][0] = 0; dct4x4[6][0] = 0; dct4x4[7][0] = 0; } static inline void pixel_sub_wxh( dctcoef *diff, int i_size, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) { for( int y = 0; y < i_size; y++ ) { for( int x = 0; x < i_size; x++ ) diff[x + y*i_size] = pix1[x] - pix2[x]; pix1 += i_pix1; pix2 += i_pix2; } } static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 ) { dctcoef d[16]; dctcoef tmp[16]; pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); for( int i = 0; i < 4; i++ ) { int s03 = d[i*4+0] + d[i*4+3]; int s12 = d[i*4+1] + d[i*4+2]; int d03 = d[i*4+0] - d[i*4+3]; int d12 = d[i*4+1] - d[i*4+2]; tmp[0*4+i] = s03 + s12; tmp[1*4+i] = 2*d03 + d12; tmp[2*4+i] = s03 - s12; tmp[3*4+i] = d03 - 2*d12; } for( int i = 0; i < 4; i++ ) { int s03 = tmp[i*4+0] + tmp[i*4+3]; int s12 = tmp[i*4+1] + tmp[i*4+2]; int d03 = tmp[i*4+0] - tmp[i*4+3]; int d12 = tmp[i*4+1] - tmp[i*4+2]; dct[i*4+0] = s03 + s12; dct[i*4+1] = 2*d03 + d12; dct[i*4+2] = s03 - s12; dct[i*4+3] = d03 - 2*d12; } } static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 ) { sub4x4_dct( dct[0], &pix1[0], &pix2[0] ); sub4x4_dct( dct[1], &pix1[4], &pix2[4] ); sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); 
sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); } static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ) { sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] ); sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] ); sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); } static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 ) { int sum = 0; for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE ) sum += pix1[0] + pix1[1] + pix1[2] + pix1[3] - pix2[0] - pix2[1] - pix2[2] - pix2[3]; return sum; } static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 ) { dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] ); dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] ); dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] ); dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] ); /* 2x2 DC transform */ int d0 = dct[0] + dct[1]; int d1 = dct[2] + dct[3]; int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; dct[0] = d0 + d1; dct[1] = d0 - d1; dct[2] = d2 + d3; dct[3] = d2 - d3; } static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 ) { int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] ); int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] ); int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] ); int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] ); int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] ); int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] ); int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] ); int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] ); /* 2x4 DC transform */ int b0 = a0 + a1; int b1 = a2 + a3; int b2 = a4 + a5; int b3 = a6 + a7; int b4 = a0 - a1; int b5 = a2 - a3; int b6 = a4 - a5; int b7 = a6 - a7; a0 = b0 + b1; a1 = b2 + b3; a2 = b4 + b5; a3 = b6 + b7; a4 = b0 - b1; a5 = b2 - b3; a6 = b4 - b5; a7 = b6 - b7; dct[0] = a0 + a1; dct[1] = a2 + a3; dct[2] = a0 - a1; dct[3] = a2 - a3; dct[4] = a4 - a5; dct[5] = a6 - a7; dct[6] = a4 + a5; dct[7] = a6 + a7; } static void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) { dctcoef d[16]; dctcoef tmp[16]; for( int i = 0; i < 4; i++ ) { int s02 = dct[0*4+i] + dct[2*4+i]; int d02 = dct[0*4+i] - dct[2*4+i]; int s13 = dct[1*4+i] + (dct[3*4+i]>>1); int d13 = (dct[1*4+i]>>1) - dct[3*4+i]; tmp[i*4+0] = s02 + s13; tmp[i*4+1] = d02 + d13; tmp[i*4+2] = d02 - d13; tmp[i*4+3] = s02 - s13; } for( int i = 0; i < 4; i++ ) { int s02 = tmp[0*4+i] + tmp[2*4+i]; int d02 = tmp[0*4+i] - tmp[2*4+i]; int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1); int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i]; d[0*4+i] = ( s02 + s13 + 32 ) >> 6; d[1*4+i] = ( d02 + d13 + 32 ) >> 6; d[2*4+i] = ( d02 - d13 + 32 ) >> 6; d[3*4+i] = ( s02 - s13 + 32 ) >> 6; } for( int y = 0; y < 4; y++ ) { for( int x = 0; x < 4; x++ ) p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] ); p_dst += FDEC_STRIDE; } } static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] ) { add4x4_idct( &p_dst[0], dct[0] ); add4x4_idct( &p_dst[4], dct[1] ); add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] ); add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] ); } static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] ) { add8x8_idct( &p_dst[0], &dct[0] ); add8x8_idct( &p_dst[8], &dct[4] ); add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] ); add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] ); } 
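/* Illustrative sketch only (not upstream code): the intended pairing of the
 * 4x4 transforms above.  sub4x4_dct() produces unnormalized forward-transform
 * coefficients and add4x4_idct() expects dequantized levels (it folds the
 * final >>6 of the inverse transform into the add), so in a real encoder a
 * quantize/dequantize step sits between the two calls; that step is elided
 * here and only the call pattern and the FENC_STRIDE/FDEC_STRIDE block layout
 * are shown.  The function name is a placeholder. */
static void example_transform_4x4( pixel *fenc /* FENC_STRIDE-strided source block */,
                                   pixel *fdec /* FDEC_STRIDE-strided prediction, updated in place */ )
{
    dctcoef dct[16];
    sub4x4_dct( dct, fenc, fdec );   /* residual = fenc - fdec, then 4x4 forward transform */
    /* ... quantization, entropy coding and dequantization of dct[] would go here ... */
    add4x4_idct( fdec, dct );        /* inverse transform and add back into the prediction */
}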
/**************************************************************************** * 8x8 transform: ****************************************************************************/ #define DCT8_1D {\ int s07 = SRC(0) + SRC(7);\ int s16 = SRC(1) + SRC(6);\ int s25 = SRC(2) + SRC(5);\ int s34 = SRC(3) + SRC(4);\ int a0 = s07 + s34;\ int a1 = s16 + s25;\ int a2 = s07 - s34;\ int a3 = s16 - s25;\ int d07 = SRC(0) - SRC(7);\ int d16 = SRC(1) - SRC(6);\ int d25 = SRC(2) - SRC(5);\ int d34 = SRC(3) - SRC(4);\ int a4 = d16 + d25 + (d07 + (d07>>1));\ int a5 = d07 - d34 - (d25 + (d25>>1));\ int a6 = d07 + d34 - (d16 + (d16>>1));\ int a7 = d16 - d25 + (d34 + (d34>>1));\ DST(0) = a0 + a1 ;\ DST(1) = a4 + (a7>>2);\ DST(2) = a2 + (a3>>1);\ DST(3) = a5 + (a6>>2);\ DST(4) = a0 - a1 ;\ DST(5) = a6 - (a5>>2);\ DST(6) = (a2>>1) - a3 ;\ DST(7) = (a4>>2) - a7 ;\ } static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 ) { dctcoef tmp[64]; pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); #define SRC(x) tmp[x*8+i] #define DST(x) tmp[x*8+i] for( int i = 0; i < 8; i++ ) DCT8_1D #undef SRC #undef DST #define SRC(x) tmp[i*8+x] #define DST(x) dct[x*8+i] for( int i = 0; i < 8; i++ ) DCT8_1D #undef SRC #undef DST } static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ) { sub8x8_dct8( dct[0], &pix1[0], &pix2[0] ); sub8x8_dct8( dct[1], &pix1[8], &pix2[8] ); sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); } #define IDCT8_1D {\ int a0 = SRC(0) + SRC(4);\ int a2 = SRC(0) - SRC(4);\ int a4 = (SRC(2)>>1) - SRC(6);\ int a6 = (SRC(6)>>1) + SRC(2);\ int b0 = a0 + a6;\ int b2 = a2 + a4;\ int b4 = a2 - a4;\ int b6 = a0 - a6;\ int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\ int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\ int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\ int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\ int b1 = (a7>>2) + a1;\ int b3 = a3 + (a5>>2);\ int b5 = (a3>>2) - a5;\ int b7 = a7 - (a1>>2);\ DST(0, b0 + b7);\ DST(1, b2 + b5);\ DST(2, b4 + b3);\ DST(3, b6 + b1);\ DST(4, b6 - b1);\ DST(5, b4 - b3);\ DST(6, b2 - b5);\ DST(7, b0 - b7);\ } static void add8x8_idct8( pixel *dst, dctcoef dct[64] ) { dct[0] += 32; // rounding for the >>6 at the end #define SRC(x) dct[x*8+i] #define DST(x,rhs) dct[x*8+i] = (rhs) for( int i = 0; i < 8; i++ ) IDCT8_1D #undef SRC #undef DST #define SRC(x) dct[i*8+x] #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) ); for( int i = 0; i < 8; i++ ) IDCT8_1D #undef SRC #undef DST } static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] ) { add8x8_idct8( &dst[0], dct[0] ); add8x8_idct8( &dst[8], dct[1] ); add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] ); add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] ); } static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc ) { dc = (dc + 32) >> 6; for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE ) { p_dst[0] = x264_clip_pixel( p_dst[0] + dc ); p_dst[1] = x264_clip_pixel( p_dst[1] + dc ); p_dst[2] = x264_clip_pixel( p_dst[2] + dc ); p_dst[3] = x264_clip_pixel( p_dst[3] + dc ); } } static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] ) { add4x4_idct_dc( &p_dst[0], dct[0] ); add4x4_idct_dc( &p_dst[4], dct[1] ); add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] ); add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] ); } static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] ) { for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE ) { add4x4_idct_dc( &p_dst[ 0], 
dct[0] ); add4x4_idct_dc( &p_dst[ 4], dct[1] ); add4x4_idct_dc( &p_dst[ 8], dct[2] ); add4x4_idct_dc( &p_dst[12], dct[3] ); } } /**************************************************************************** * x264_dct_init: ****************************************************************************/ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf ) { dctf->sub4x4_dct = sub4x4_dct; dctf->add4x4_idct = add4x4_idct; dctf->sub8x8_dct = sub8x8_dct; dctf->sub8x8_dct_dc = sub8x8_dct_dc; dctf->add8x8_idct = add8x8_idct; dctf->add8x8_idct_dc = add8x8_idct_dc; dctf->sub8x16_dct_dc = sub8x16_dct_dc; dctf->sub16x16_dct = sub16x16_dct; dctf->add16x16_idct = add16x16_idct; dctf->add16x16_idct_dc = add16x16_idct_dc; dctf->sub8x8_dct8 = sub8x8_dct8; dctf->add8x8_idct8 = add8x8_idct8; dctf->sub16x16_dct8 = sub16x16_dct8; dctf->add16x16_idct8 = add16x16_idct8; dctf->dct4x4dc = dct4x4dc; dctf->idct4x4dc = idct4x4dc; dctf->dct2x4dc = dct2x4dc; #if HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { dctf->sub4x4_dct = x264_sub4x4_dct_mmx; dctf->sub8x8_dct = x264_sub8x8_dct_mmx; dctf->sub16x16_dct = x264_sub16x16_dct_mmx; } if( cpu&X264_CPU_SSE2 ) { dctf->add4x4_idct = x264_add4x4_idct_sse2; dctf->dct4x4dc = x264_dct4x4dc_sse2; dctf->idct4x4dc = x264_idct4x4dc_sse2; dctf->dct2x4dc = x264_dct2x4dc_sse2; dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; dctf->add16x16_idct = x264_add16x16_idct_sse2; dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; dctf->add16x16_idct8 = x264_add16x16_idct8_sse2; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2; dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2; dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2; } if( cpu&X264_CPU_SSE4 ) { dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4; dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4; } if( cpu&X264_CPU_AVX ) { dctf->add4x4_idct = x264_add4x4_idct_avx; dctf->dct4x4dc = x264_dct4x4dc_avx; dctf->idct4x4dc = x264_idct4x4dc_avx; dctf->dct2x4dc = x264_dct2x4dc_avx; dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; dctf->add8x8_idct = x264_add8x8_idct_avx; dctf->add16x16_idct = x264_add16x16_idct_avx; dctf->add8x8_idct8 = x264_add8x8_idct8_avx; dctf->add16x16_idct8 = x264_add16x16_idct8_avx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx; dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx; dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx; } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { dctf->sub4x4_dct = x264_sub4x4_dct_mmx; dctf->add4x4_idct = x264_add4x4_idct_mmx; dctf->idct4x4dc = x264_idct4x4dc_mmx; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2; #if !ARCH_X86_64 dctf->sub8x8_dct = x264_sub8x8_dct_mmx; dctf->sub16x16_dct = x264_sub16x16_dct_mmx; dctf->add8x8_idct = x264_add8x8_idct_mmx; dctf->add16x16_idct = x264_add16x16_idct_mmx; dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx; dctf->add8x8_idct8 = x264_add8x8_idct8_mmx; dctf->add16x16_idct8= x264_add16x16_idct8_mmx; #endif } if( cpu&X264_CPU_MMX2 ) { dctf->dct4x4dc = x264_dct4x4dc_mmx2; dctf->dct2x4dc = x264_dct2x4dc_mmx2; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2; } if( cpu&X264_CPU_SSE2 ) { dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2; dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2; dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2; 
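/* Note on the pattern used throughout x264_dct_init(): each cpu-flag block
 * overwrites the pointers set up by the slower paths above it, so the most
 * capable extension that is both compiled in and reported by the cpu flags
 * generally wins (modulo the slow-cpu exceptions below).  Callers never
 * reference the SIMD symbols directly; they go through the filled-in table,
 * roughly as follows (local buffer names are hypothetical):
 *
 *     x264_dct_function_t dctf;
 *     x264_dct_init( cpu, &dctf );
 *     dctf.sub4x4_dct( coefs, fenc_block, fdec_block );   // fastest version for this CPU
 */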
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2; dctf->add16x16_idct8= x264_add16x16_idct8_sse2; if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) { dctf->sub8x8_dct = x264_sub8x8_dct_sse2; dctf->sub16x16_dct = x264_sub16x16_dct_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; dctf->add16x16_idct = x264_add16x16_idct_sse2; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; } } if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3; if( !(cpu&X264_CPU_SLOW_ATOM) ) { dctf->sub4x4_dct = x264_sub4x4_dct_ssse3; dctf->sub8x8_dct = x264_sub8x8_dct_ssse3; dctf->sub16x16_dct = x264_sub16x16_dct_ssse3; dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3; dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3; if( !(cpu&X264_CPU_SLOW_PSHUFB) ) { dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; } } } if( cpu&X264_CPU_SSE4 ) dctf->add4x4_idct = x264_add4x4_idct_sse4; if( cpu&X264_CPU_AVX ) { dctf->add4x4_idct = x264_add4x4_idct_avx; dctf->add8x8_idct = x264_add8x8_idct_avx; dctf->add16x16_idct = x264_add16x16_idct_avx; dctf->add8x8_idct8 = x264_add8x8_idct8_avx; dctf->add16x16_idct8 = x264_add16x16_idct8_avx; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx; dctf->sub8x8_dct = x264_sub8x8_dct_avx; dctf->sub16x16_dct = x264_sub16x16_dct_avx; dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx; } if( cpu&X264_CPU_XOP ) { dctf->sub8x8_dct = x264_sub8x8_dct_xop; dctf->sub16x16_dct = x264_sub16x16_dct_xop; } if( cpu&X264_CPU_AVX2 ) { dctf->add8x8_idct = x264_add8x8_idct_avx2; dctf->add16x16_idct = x264_add16x16_idct_avx2; dctf->sub8x8_dct = x264_sub8x8_dct_avx2; dctf->sub16x16_dct = x264_sub16x16_dct_avx2; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2; #if ARCH_X86_64 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; #endif } if( cpu&X264_CPU_AVX512 ) { dctf->sub4x4_dct = x264_sub4x4_dct_avx512; dctf->sub8x8_dct = x264_sub8x8_dct_avx512; dctf->sub16x16_dct = x264_sub16x16_dct_avx512; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512; dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512; dctf->add8x8_idct = x264_add8x8_idct_avx512; } #endif //HAVE_MMX #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { dctf->sub4x4_dct = x264_sub4x4_dct_altivec; dctf->sub8x8_dct = x264_sub8x8_dct_altivec; dctf->sub16x16_dct = x264_sub16x16_dct_altivec; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_altivec; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_altivec; dctf->add4x4_idct = x264_add4x4_idct_altivec; dctf->add8x8_idct = x264_add8x8_idct_altivec; dctf->add16x16_idct = x264_add16x16_idct_altivec; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_altivec; dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec; dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec; dctf->add8x8_idct8 = x264_add8x8_idct8_altivec; dctf->add16x16_idct8= x264_add16x16_idct8_altivec; } #endif #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { dctf->sub4x4_dct = x264_sub4x4_dct_neon; dctf->sub8x8_dct = x264_sub8x8_dct_neon; dctf->sub16x16_dct = x264_sub16x16_dct_neon; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon; dctf->dct4x4dc = x264_dct4x4dc_neon; dctf->idct4x4dc = x264_idct4x4dc_neon; dctf->add4x4_idct = x264_add4x4_idct_neon; dctf->add8x8_idct = x264_add8x8_idct_neon; dctf->add16x16_idct = x264_add16x16_idct_neon; dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon; dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon; dctf->add8x8_idct8 = x264_add8x8_idct8_neon; 
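/* Every SIMD pointer assigned in this function is intended as a drop-in
 * replacement for the corresponding C function above: same prototype, same
 * output (the checkasm tool compares them against the C reference).  Rough
 * sketch, with hypothetical buffer names:
 *
 *     void (*dct4x4)( dctcoef[16], pixel*, pixel* ) =
 *         (cpu & X264_CPU_NEON) ? x264_sub4x4_dct_neon : sub4x4_dct;
 *     dct4x4( coefs, enc_block, dec_block );   // either choice yields the same coefficients
 */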
dctf->add16x16_idct8= x264_add16x16_idct8_neon; dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon; } #if HAVE_SVE if ( cpu&X264_CPU_SVE ) { dctf->sub4x4_dct = x264_sub4x4_dct_sve; } #endif #if HAVE_SVE2 if ( cpu&X264_CPU_SVE2 ) { dctf->add4x4_idct = x264_add4x4_idct_sve2; } #endif #endif #if HAVE_MSA if( cpu&X264_CPU_MSA ) { dctf->sub4x4_dct = x264_sub4x4_dct_msa; dctf->sub8x8_dct = x264_sub8x8_dct_msa; dctf->sub16x16_dct = x264_sub16x16_dct_msa; dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa; dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa; dctf->dct4x4dc = x264_dct4x4dc_msa; dctf->idct4x4dc = x264_idct4x4dc_msa; dctf->add4x4_idct = x264_add4x4_idct_msa; dctf->add8x8_idct = x264_add8x8_idct_msa; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa; dctf->add16x16_idct = x264_add16x16_idct_msa; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa; dctf->add8x8_idct8 = x264_add8x8_idct8_msa; dctf->add16x16_idct8 = x264_add16x16_idct8_msa; } #endif #if HAVE_LSX if( cpu&X264_CPU_LSX ) { dctf->sub4x4_dct = x264_sub4x4_dct_lsx; dctf->add4x4_idct = x264_add4x4_idct_lsx; dctf->dct4x4dc = x264_dct4x4dc_lsx; dctf->idct4x4dc = x264_idct4x4dc_lsx; dctf->sub8x8_dct8 = x264_sub8x8_dct8_lsx; dctf->sub8x8_dct = x264_sub8x8_dct_lsx; dctf->add8x8_idct = x264_add8x8_idct_lsx; dctf->add8x8_idct8 = x264_add8x8_idct8_lsx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lsx; dctf->add16x16_idct = x264_add16x16_idct_lsx; dctf->sub16x16_dct = x264_sub16x16_dct_lsx; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lsx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_lsx; } if( cpu&X264_CPU_LASX ) { dctf->sub8x8_dct = x264_sub8x8_dct_lasx; dctf->sub16x16_dct = x264_sub16x16_dct_lasx; dctf->add8x8_idct = x264_add8x8_idct_lasx; dctf->add8x8_idct8 = x264_add8x8_idct8_lasx; dctf->add16x16_idct = x264_add16x16_idct_lasx; dctf->sub16x16_dct8 = x264_sub16x16_dct8_lasx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lasx; dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lasx; dctf->dct4x4dc = x264_dct4x4dc_lasx; dctf->idct4x4dc = x264_idct4x4dc_lasx; } #endif #endif // HIGH_BIT_DEPTH } #define ZIG(i,y,x) level[i] = dct[x*8+y]; #define ZIGZAG8_FRAME\ ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\ ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\ ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\ ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\ ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\ ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\ ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\ ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\ ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\ ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\ ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\ ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\ ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\ ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\ #define ZIGZAG8_FIELD\ ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\ ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\ ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\ ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\ ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\ ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\ ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\ ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\ ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\ ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\ ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\ ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\ ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) 
ZIG(51,3,6)\ ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\ ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\ ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7) #define ZIGZAG4_FRAME\ ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\ ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) #define ZIGZAG4_FIELD\ ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\ ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\ ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\ ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3) static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] ) { ZIGZAG8_FRAME } static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] ) { ZIGZAG8_FIELD } #undef ZIG #define ZIG(i,y,x) level[i] = dct[x*4+y]; #define ZIGDC(i,y,x) ZIG(i,y,x) static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] ) { ZIGZAG4_FRAME } static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] ) { memcpy( level, dct, 2 * sizeof(dctcoef) ); ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1) memcpy( level+6, dct+6, 10 * sizeof(dctcoef) ); } #undef ZIG #define ZIG(i,y,x) {\ int oe = x+y*FENC_STRIDE;\ int od = x+y*FDEC_STRIDE;\ level[i] = p_src[oe] - p_dst[od];\ nz |= level[i];\ } #define COPY4x4\ CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\ CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\ CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE ); #define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) ) #define COPY8x8\ CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\ CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\ CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\ CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\ CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\ CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\ CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE ); static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst ) { int nz = 0; ZIGZAG4_FRAME COPY4x4 return !!nz; } static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst ) { int nz = 0; ZIGZAG4_FIELD COPY4x4 return !!nz; } #undef ZIGDC #define ZIGDC(i,y,x) {\ int oe = x+y*FENC_STRIDE;\ int od = x+y*FDEC_STRIDE;\ *dc = p_src[oe] - p_dst[od];\ level[0] = 0;\ } static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ) { int nz = 0; ZIGZAG4_FRAME COPY4x4 return !!nz; } static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ) { int nz = 0; ZIGZAG4_FIELD COPY4x4 return !!nz; } static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst ) { int nz = 0; ZIGZAG8_FRAME COPY8x8 return !!nz; } static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst ) { int nz = 0; ZIGZAG8_FIELD COPY8x8 return !!nz; } #undef ZIG #undef COPY4x4 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz ) { for( int i = 0; i < 4; i++ ) { int nz = 0; for( int j = 0; j < 16; j++ ) { nz |= src[i+j*4]; dst[i*16+j] = src[i+j*4]; } nnz[(i&1) + (i>>1)*8] = !!nz; } } void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ) { pf_interlaced->scan_8x8 = zigzag_scan_8x8_field; pf_progressive->scan_8x8 
= zigzag_scan_8x8_frame; pf_interlaced->scan_4x4 = zigzag_scan_4x4_field; pf_progressive->scan_4x4 = zigzag_scan_4x4_frame; pf_interlaced->sub_8x8 = zigzag_sub_8x8_field; pf_progressive->sub_8x8 = zigzag_sub_8x8_frame; pf_interlaced->sub_4x4 = zigzag_sub_4x4_field; pf_progressive->sub_4x4 = zigzag_sub_4x4_frame; pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field; pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame; #if HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_SSE2 ) { pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2; pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; } if( cpu&X264_CPU_SSE4 ) pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4; if( cpu&X264_CPU_AVX ) pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx; #if ARCH_X86_64 if( cpu&X264_CPU_AVX ) { pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; } #endif // ARCH_X86_64 if( cpu&X264_CPU_AVX512 ) { pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; } #endif // HAVE_MMX #else #if HAVE_MMX if( cpu&X264_CPU_MMX ) pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; if( cpu&X264_CPU_MMX2 ) { pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2; } if( cpu&X264_CPU_SSE ) pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse; if( cpu&X264_CPU_SSE2_IS_FAST ) pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2; if( cpu&X264_CPU_SSSE3 ) { pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3; pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3; pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; if( !(cpu&X264_CPU_SLOW_SHUFFLE) ) pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; } if( cpu&X264_CPU_AVX ) { pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx; pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx; #if ARCH_X86_64 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx; pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx; #endif pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx; } if( cpu&X264_CPU_XOP ) { pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop; } if( cpu&X264_CPU_AVX512 ) { pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; } #endif // HAVE_MMX #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_altivec; } #endif #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; #if HAVE_AARCH64 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon; pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon; 
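/* x264_zigzag_init() fills two independent tables because an MBAFF stream
 * can mix frame- and field-coded macroblocks, which use different scan
 * orders.  The encoder keeps both tables and selects per macroblock,
 * roughly as below (the mb_is_field flag is a hypothetical stand-in):
 *
 *     x264_zigzag_function_t zigzagf, zigzagf_interlaced;
 *     x264_zigzag_init( cpu, &zigzagf, &zigzagf_interlaced );
 *     const x264_zigzag_function_t *zz = mb_is_field ? &zigzagf_interlaced : &zigzagf;
 *     zz->scan_4x4( level, dct );   // frame (zigzag) or field scan order
 */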
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon; pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon; pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon; pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon; pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon; #endif // HAVE_AARCH64 } #endif // HAVE_ARMV6 || HAVE_AARCH64 #endif // HIGH_BIT_DEPTH pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; #if HAVE_MMX #if HIGH_BIT_DEPTH if( cpu&X264_CPU_SSE2 ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; } if( cpu&X264_CPU_AVX ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; } if( cpu&X264_CPU_AVX512 ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; } #else if( cpu&X264_CPU_MMX ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; } if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; } if( cpu&X264_CPU_AVX ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; } if( cpu&X264_CPU_AVX2 ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; } if( cpu&X264_CPU_AVX512 ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; } #endif // HIGH_BIT_DEPTH #endif #if !HIGH_BIT_DEPTH #if HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; } #if HAVE_SVE if( cpu&X264_CPU_SVE ) { pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve; } #endif #endif // HAVE_AARCH64 #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_altivec; } #endif // HAVE_ALTIVEC #if HAVE_MSA if( cpu&X264_CPU_MSA ) { pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; } #endif #if HAVE_LSX if( cpu&X264_CPU_LASX ) { pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_lasx; } #endif #endif // !HIGH_BIT_DEPTH } x264-master/common/dct.h000066400000000000000000000063171502133446700153010ustar00rootroot00000000000000/***************************************************************************** * dct.h: transform and zigzag ***************************************************************************** * Copyright (C) 2004-2025 x264 project * * Authors: Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_DCT_H #define X264_DCT_H typedef struct { // pix1 stride = FENC_STRIDE // pix2 stride = FDEC_STRIDE // p_dst stride = FDEC_STRIDE void (*sub4x4_dct) ( dctcoef dct[16], pixel *pix1, pixel *pix2 ); void (*add4x4_idct)( pixel *p_dst, dctcoef dct[16] ); void (*sub8x8_dct) ( dctcoef dct[4][16], pixel *pix1, pixel *pix2 ); void (*sub8x8_dct_dc) ( dctcoef dct[4], pixel *pix1, pixel *pix2 ); void (*add8x8_idct) ( pixel *p_dst, dctcoef dct[4][16] ); void (*add8x8_idct_dc)( pixel *p_dst, dctcoef dct[4] ); void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 ); void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ); void (*add16x16_idct) ( pixel *p_dst, dctcoef dct[16][16] ); void (*add16x16_idct_dc)( pixel *p_dst, dctcoef dct[16] ); void (*sub8x8_dct8) ( dctcoef dct[64], pixel *pix1, pixel *pix2 ); void (*add8x8_idct8)( pixel *p_dst, dctcoef dct[64] ); void (*sub16x16_dct8) ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); void (*add16x16_idct8)( pixel *p_dst, dctcoef dct[4][64] ); void (*dct4x4dc) ( dctcoef d[16] ); void (*idct4x4dc)( dctcoef d[16] ); void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] ); } x264_dct_function_t; typedef struct { void (*scan_8x8)( dctcoef level[64], dctcoef dct[64] ); void (*scan_4x4)( dctcoef level[16], dctcoef dct[16] ); int (*sub_8x8) ( dctcoef level[64], const pixel *p_src, pixel *p_dst ); int (*sub_4x4) ( dctcoef level[16], const pixel *p_src, pixel *p_dst ); int (*sub_4x4ac)( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); void (*interleave_8x8_cavlc)( dctcoef *dst, dctcoef *src, uint8_t *nnz ); } x264_zigzag_function_t; #define x264_dct_init x264_template(dct_init) void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf ); #define x264_zigzag_init x264_template(zigzag_init) void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ); #endif x264-master/common/deblock.c000066400000000000000000001062471502133446700161300ustar00rootroot00000000000000/***************************************************************************** * deblock.c: deblocking ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" /* Deblocking filter */ static const uint8_t i_alpha_table[52+12*3] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 17, 20, 22, 25, 28, 32, 36, 40, 45, 50, 56, 63, 71, 80, 90,101,113,127,144,162,182,203,226, 255,255, 255,255,255,255,255,255,255,255,255,255,255,255, }; static const uint8_t i_beta_table[52+12*3] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, }; static const int8_t i_tc0_table[52+12*3][4] = { {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 }, {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 }, {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 }, {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, }; #define alpha_table(x) i_alpha_table[(x)+24] #define beta_table(x) i_beta_table[(x)+24] #define tc0_table(x) i_tc0_table[(x)+24] /* From ffmpeg */ static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; int q0 = pix[ 0*xstride]; int q1 = pix[ 1*xstride]; int q2 = pix[ 2*xstride]; if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta ) { int tc = tc0; int delta; if( abs( p2 - p0 ) < beta ) { if( tc0 ) pix[-2*xstride] = p1 + x264_clip3( (( p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1, -tc0, tc0 ); tc++; } if( abs( q2 - q0 ) < beta ) { if( tc0 ) pix[ 1*xstride] = q1 + x264_clip3( (( q2 + ((p0 + q0 + 1) >> 1)) >> 1) - q1, -tc0, tc0 ); tc++; } delta = x264_clip3( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc ); pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { if( tc0[i] < 0 ) { pix += 4*ystride; continue; } for( int d = 0; d < 
4; d++, pix += ystride ) deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] ); } } static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { for( int d = 0; d < 8; d++, pix += stride ) deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] ); } static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, stride, 1, alpha, beta, tc0 ); } static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, 1, stride, alpha, beta, tc0 ); } static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; int q0 = pix[ 0*xstride]; int q1 = pix[ 1*xstride]; if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta ) { int delta = x264_clip3( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc ); pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { int tc = tc0[i]; if( tc <= 0 ) { pix += height*ystride; continue; } for( int d = 0; d < height; d++, pix += ystride-2 ) for( int e = 0; e < 2; e++, pix++ ) deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); } } static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 ); } static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); } static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); } static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; int q0 = pix[ 0*xstride]; int q1 = pix[ 1*xstride]; int q2 = pix[ 2*xstride]; if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta ) { if( abs( p0 - q0 ) < ((alpha >> 2) + 2) ) { if( abs( p2 - p0 ) < beta ) /* p0', p1', p2' */ { const int p3 = pix[-4*xstride]; pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3; pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2; pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3; } else /* p0' */ pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; if( abs( q2 - q0 ) < beta ) /* q0', q1', q2' */ { const int q3 = pix[3*xstride]; pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3; pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3; } else /* q0' */ pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; } else /* p0', q0' */ { pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2; } } } static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 16; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, xstride, alpha, beta ); } static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int 
alpha, int beta ) { for( int d = 0; d < 8; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, 1, alpha, beta ); } static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, stride, 1, alpha, beta ); } static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; int q0 = pix[ 0*xstride]; int q1 = pix[ 1*xstride]; if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta ) { pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */ pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } } static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < height; d++, pix += ystride-2 ) for( int e = 0; e < width; e++, pix++ ) deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); } static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta ); } static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta ); } static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta ); } static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta ); } static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ) { for( int dir = 0; dir < 2; dir++ ) { int s1 = dir ? 1 : 8; int s2 = dir ? 
8 : 1; for( int edge = 0; edge < 4; edge++ ) for( int i = 0, loc = X264_SCAN8_0+edge*s2; i < 4; i++, loc += s1 ) { int locn = loc - s2; if( nnz[loc] || nnz[locn] ) bs[dir][edge][i] = 2; else if( ref[0][loc] != ref[0][locn] || abs( mv[0][loc][0] - mv[0][locn][0] ) >= 4 || abs( mv[0][loc][1] - mv[0][locn][1] ) >= mvy_limit || (bframe && (ref[1][loc] != ref[1][locn] || abs( mv[1][loc][0] - mv[1][locn][0] ) >= 4 || abs( mv[1][loc][1] - mv[1][locn][1] ) >= mvy_limit ))) { bs[dir][edge][i] = 1; } else bs[dir][edge][i] = 0; } } } static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp + a; int index_b = i_qp + b; int alpha = alpha_table(index_a) << (BIT_DEPTH-8); int beta = beta_table(index_b) << (BIT_DEPTH-8); int8_t tc[4]; if( !M32(bS) || !alpha || !beta ) return; tc[0] = (tc0_table(index_a)[bS[0]] * (1 << (BIT_DEPTH-8))) + b_chroma; tc[1] = (tc0_table(index_a)[bS[1]] * (1 << (BIT_DEPTH-8))) + b_chroma; tc[2] = (tc0_table(index_a)[bS[2]] * (1 << (BIT_DEPTH-8))) + b_chroma; tc[3] = (tc0_table(index_a)[bS[3]] * (1 << (BIT_DEPTH-8))) + b_chroma; pf_inter( pix, i_stride, alpha, beta, tc ); } static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) { int index_a = i_qp + a; int index_b = i_qp + b; int alpha = alpha_table(index_a) << (BIT_DEPTH-8); int beta = beta_table(index_b) << (BIT_DEPTH-8); if( !alpha || !beta ) return; pf_intra( pix, i_stride, alpha, beta ); } static ALWAYS_INLINE void macroblock_cache_load_neighbours_deblock( x264_t *h, int mb_x, int mb_y ) { int deblock_on_slice_edges = h->sh.i_disable_deblocking_filter_idc != 2; h->mb.i_neighbour = 0; h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x; h->mb.b_interlaced = PARAM_INTERLACED && h->mb.field[h->mb.i_mb_xy]; h->mb.i_mb_top_y = mb_y - (1 << MB_INTERLACED); h->mb.i_mb_top_xy = mb_x + h->mb.i_mb_stride*h->mb.i_mb_top_y; h->mb.i_mb_left_xy[1] = h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1; if( SLICE_MBAFF ) { if( mb_y&1 ) { if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED ) h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride; } else { if( h->mb.i_mb_top_xy >= 0 && MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy] ) { h->mb.i_mb_top_xy += h->mb.i_mb_stride; h->mb.i_mb_top_y++; } if( mb_x && h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED ) h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride; } } if( mb_x > 0 && (deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_left_xy[0]] == h->mb.slice_table[h->mb.i_mb_xy]) ) h->mb.i_neighbour |= MB_LEFT; if( mb_y > MB_INTERLACED && (deblock_on_slice_edges || h->mb.slice_table[h->mb.i_mb_top_xy] == h->mb.slice_table[h->mb.i_mb_xy]) ) h->mb.i_neighbour |= MB_TOP; } void x264_frame_deblock_row( x264_t *h, int mb_y ) { int b_interlaced = SLICE_MBAFF; int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET; int b = h->sh.i_beta_offset - QP_BD_OFFSET; int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); int stridey = h->fdec->i_stride[0]; int strideuv = h->fdec->i_stride[1]; int chroma_format = CHROMA_FORMAT; int chroma444 = CHROMA444; int chroma_height = 16 >> CHROMA_V_SHIFT; intptr_t uvdiff = chroma444 ? 
h->fdec->plane[2] - h->fdec->plane[1] : 1; for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced ) { x264_prefetch_fenc( h, h->fdec, mb_x, mb_y ); macroblock_cache_load_neighbours_deblock( h, mb_x, mb_y ); int mb_xy = h->mb.i_mb_xy; int transform_8x8 = h->mb.mb_transform_size[mb_xy]; int intra_cur = IS_INTRA( h->mb.type[mb_xy] ); uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?mb_xy:mb_x]; pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; pixel *pixuv = CHROMA_FORMAT ? h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x : NULL; if( mb_y & MB_INTERLACED ) { pixy -= 15*stridey; if( CHROMA_FORMAT ) pixuv -= (chroma_height-1)*strideuv; } int stride2y = stridey << MB_INTERLACED; int stride2uv = strideuv << MB_INTERLACED; int qp = h->mb.qp[mb_xy]; int qpc = h->chroma_qp_table[qp]; int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh; #define FILTER( intra, dir, edge, qp, chroma_qp )\ do\ {\ if( !(edge & 1) || !transform_8x8 )\ {\ deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\ stride2y, bs[dir][edge], qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ if( chroma_format == CHROMA_444 )\ {\ deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\ stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\ stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ }\ else if( chroma_format == CHROMA_420 && !(edge & 1) )\ {\ deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\ stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\ h->loopf.deblock_chroma##intra[dir] );\ }\ }\ if( chroma_format == CHROMA_422 && (dir || !(edge & 1)) )\ {\ deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\ stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\ h->loopf.deblock_chroma##intra[dir] );\ }\ } while( 0 ) if( h->mb.i_neighbour & MB_LEFT ) { if( b_interlaced && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED ) { int luma_qp[2]; int chroma_qp[2]; int left_qp[2]; x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff; x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff; x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff; x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff; int c = chroma444 ? 0 : 1; left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]]; luma_qp[0] = (qp + left_qp[0] + 1) >> 1; chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1; if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[0]] ) ) { deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock ); if( chroma_format ) { deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); if( chroma444 ) deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); } } else { deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock ); if( chroma_format ) { deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); if( chroma444 ) deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); } } int offy = MB_INTERLACED ? 4 : 0; int offuv = MB_INTERLACED ? 
4-CHROMA_V_SHIFT : 0; left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]]; luma_qp[1] = (qp + left_qp[1] + 1) >> 1; chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1; if( intra_cur || IS_INTRA( h->mb.type[h->mb.i_mb_left_xy[1]] ) ) { deblock_edge_intra( h, pixy + (stridey<mb.qp[h->mb.i_mb_xy-1]; int qp_left = (qp + qpl + 1) >> 1; int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1; int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] ); int intra_deblock = intra_cur || intra_left; /* Any MB that was coded, or that analysis decided to skip, has quality commensurate with its QP. * But if deblocking affects neighboring MBs that were force-skipped, blur might accumulate there. * So reset their effective QP to max, to indicate that lack of guarantee. */ if( h->fdec->mb_info && M32( bs[0][0] ) ) { #define RESET_EFFECTIVE_QP(xy) h->fdec->effective_qp[xy] |= 0xff * !!(h->fdec->mb_info[xy] & X264_MBINFO_CONSTANT); RESET_EFFECTIVE_QP(mb_xy); RESET_EFFECTIVE_QP(h->mb.i_mb_left_xy[0]); } if( intra_deblock ) FILTER( _intra, 0, 0, qp_left, qpc_left ); else FILTER( , 0, 0, qp_left, qpc_left ); } } if( !first_edge_only ) { FILTER( , 0, 1, qp, qpc ); FILTER( , 0, 2, qp, qpc ); FILTER( , 0, 3, qp, qpc ); } if( h->mb.i_neighbour & MB_TOP ) { if( b_interlaced && !(mb_y&1) && !MB_INTERLACED && h->mb.field[h->mb.i_mb_top_xy] ) { int mbn_xy = mb_xy - 2 * h->mb.i_mb_stride; for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride ) { int qpt = h->mb.qp[mbn_xy]; int qp_top = (qp + qpt + 1) >> 1; int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1; int intra_top = IS_INTRA( h->mb.type[mbn_xy] ); if( intra_cur || intra_top ) M32( bs[1][4*j] ) = 0x03030303; // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, a, b, 0, h->loopf.deblock_luma[1] ); if( chroma444 ) { deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] ); deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, h->loopf.deblock_luma[1] ); } else if( chroma_format ) deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, h->loopf.deblock_chroma[1] ); } } else { int qpt = h->mb.qp[h->mb.i_mb_top_xy]; int qp_top = (qp + qpt + 1) >> 1; int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1; int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] ); int intra_deblock = intra_cur || intra_top; /* This edge has been modified, reset effective qp to max. */ if( h->fdec->mb_info && M32( bs[1][0] ) ) { RESET_EFFECTIVE_QP(mb_xy); RESET_EFFECTIVE_QP(h->mb.i_mb_top_xy); } if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) && intra_deblock ) { FILTER( _intra, 1, 0, qp_top, qpc_top ); } else { if( intra_deblock ) M32( bs[1][0] ) = 0x03030303; FILTER( , 1, 0, qp_top, qpc_top ); } } } if( !first_edge_only ) { FILTER( , 1, 1, qp, qpc ); FILTER( , 1, 2, qp, qpc ); FILTER( , 1, 3, qp, qpc ); } #undef FILTER } } /* For deblock-aware RD. 
* TODO: * deblock macroblock edges * support analysis partitions smaller than 16x16 * deblock chroma for 4:2:0/4:2:2 * handle duplicate refs correctly */ void x264_macroblock_deblock( x264_t *h ) { int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET; int b = h->sh.i_beta_offset - QP_BD_OFFSET; int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); int intra_cur = IS_INTRA( h->mb.i_type ); int qp = h->mb.i_qp; int qpc = h->mb.i_chroma_qp; if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh ) return; uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( intra_cur ) { M32( bs[0][1] ) = 0x03030303; M64( bs[0][2] ) = 0x0303030303030303ULL; M32( bs[1][1] ) = 0x03030303; M64( bs[1][2] ) = 0x0303030303030303ULL; } else h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B ); int transform_8x8 = h->mb.b_transform_8x8; #define FILTER( dir, edge )\ do\ {\ deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\ FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\ h->loopf.deblock_luma[dir] );\ if( CHROMA444 )\ {\ deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\ FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\ h->loopf.deblock_luma[dir] );\ deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\ FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\ h->loopf.deblock_luma[dir] );\ }\ } while( 0 ) if( !transform_8x8 ) FILTER( 0, 1 ); FILTER( 0, 2 ); if( !transform_8x8 ) FILTER( 0, 3 ); if( !transform_8x8 ) FILTER( 1, 1 ); FILTER( 1, 2 ); if( !transform_8x8 ) FILTER( 1, 3 ); #undef FILTER } #if HAVE_MMX #include "x86/deblock.h" #endif #if HAVE_ALTIVEC #include "ppc/deblock.h" #endif #if HAVE_ARMV6 #include "arm/deblock.h" #endif #if HAVE_AARCH64 #include "aarch64/deblock.h" #endif #if HAVE_MSA #include "mips/deblock.h" #endif #if HAVE_LSX #include "loongarch/deblock.h" #endif void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff ) { pf->deblock_luma[1] = deblock_v_luma_c; pf->deblock_luma[0] = deblock_h_luma_c; pf->deblock_chroma[1] = deblock_v_chroma_c; pf->deblock_h_chroma_420 = deblock_h_chroma_c; pf->deblock_h_chroma_422 = deblock_h_chroma_422_c; pf->deblock_luma_intra[1] = deblock_v_luma_intra_c; pf->deblock_luma_intra[0] = deblock_h_luma_intra_c; pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c; pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c; pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c; pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c; pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c; pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c; pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c; pf->deblock_strength = deblock_strength_c; #if HAVE_MMX if( cpu&X264_CPU_MMX2 ) { #if ARCH_X86 pf->deblock_luma[1] = x264_deblock_v_luma_mmx2; pf->deblock_luma[0] = x264_deblock_h_luma_mmx2; pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2; pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_mmx2; pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2; pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_mmx2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2; 
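/* Index convention for the paired entries in this table: [0] holds the
 * filters for the vertical edges between horizontally adjacent blocks (the
 * "_h_" kernels, which step across the edge with a pixel stride of 1),
 * while [1] holds the filters for the horizontal edges between vertically
 * adjacent blocks (the "_v_" kernels, which step across the edge with the
 * line stride).  A call through the table looks roughly like this
 * (buffer and threshold values are hypothetical):
 *
 *     int8_t tc0[4] = { 0, 0, 1, 1 };                        // one tc0 per group of 4 samples along the edge
 *     pf->deblock_luma[0]( pix, stride, alpha, beta, tc0 );  // vertical 16-sample edge at pix
 */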
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; #endif #if !HIGH_BIT_DEPTH pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; #endif if( cpu&X264_CPU_SSE2 ) { pf->deblock_strength = x264_deblock_strength_sse2; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2; pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2; pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_sse2; pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_sse2; pf->deblock_luma[1] = x264_deblock_v_luma_sse2; pf->deblock_luma[0] = x264_deblock_h_luma_sse2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2; #if HIGH_BIT_DEPTH pf->deblock_chroma_420_intra_mbaff= x264_deblock_h_chroma_intra_mbaff_sse2; #endif } } if( cpu&X264_CPU_SSSE3 ) pf->deblock_strength = x264_deblock_strength_ssse3; if( cpu&X264_CPU_AVX ) { pf->deblock_strength = x264_deblock_strength_avx; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx; pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx; pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_avx; pf->deblock_luma[1] = x264_deblock_v_luma_avx; pf->deblock_luma[0] = x264_deblock_h_luma_avx; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_chroma[1] = x264_deblock_v_chroma_avx; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx; #if HIGH_BIT_DEPTH pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_avx; pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_avx; #endif } } if( cpu&X264_CPU_AVX2 ) { pf->deblock_strength = x264_deblock_strength_avx2; } if( cpu&X264_CPU_AVX512 ) { pf->deblock_strength = x264_deblock_strength_avx512; } } #endif #if !HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { pf->deblock_luma[1] = x264_deblock_v_luma_altivec; pf->deblock_luma[0] = x264_deblock_h_luma_altivec; } #endif // HAVE_ALTIVEC #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon; pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon; pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon; pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon; pf->deblock_strength = x264_deblock_strength_neon; } #if HAVE_SVE if ( cpu&X264_CPU_SVE ) { pf->deblock_chroma[1] = x264_deblock_v_chroma_sve; } #endif #endif #if HAVE_MSA if( cpu&X264_CPU_MSA ) { pf->deblock_luma[1] = x264_deblock_v_luma_msa; pf->deblock_luma[0] = x264_deblock_h_luma_msa; pf->deblock_chroma[1] = x264_deblock_v_chroma_msa; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa; pf->deblock_luma_intra[1] 
= x264_deblock_v_luma_intra_msa; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa; pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa; pf->deblock_strength = x264_deblock_strength_msa; } #endif #if HAVE_LSX if( cpu&X264_CPU_LSX ) { pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lsx; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lsx; pf->deblock_strength = x264_deblock_strength_lsx; } if( cpu&X264_CPU_LASX ) { pf->deblock_luma[1] = x264_deblock_v_luma_lasx; pf->deblock_luma[0] = x264_deblock_h_luma_lasx; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lasx; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lasx; pf->deblock_strength = x264_deblock_strength_lasx; } #endif #endif // !HIGH_BIT_DEPTH /* These functions are equivalent, so don't duplicate them. */ pf->deblock_chroma_422_mbaff = pf->deblock_h_chroma_420; pf->deblock_chroma_422_intra_mbaff = pf->deblock_h_chroma_420_intra; } x264-master/common/frame.c000066400000000000000000001001741502133446700156100ustar00rootroot00000000000000/***************************************************************************** * frame.c: frame handling ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common.h" static int align_stride( int x, int align, int disalign ) { x = ALIGN( x, align ); if( !(x&(disalign-1)) ) x += align; return x; } static int align_plane_size( int x, int disalign ) { if( !(x&(disalign-1)) ) x += X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL; return x; } static int frame_internal_csp( int external_csp ) { int csp = external_csp & X264_CSP_MASK; if( csp == X264_CSP_I400 ) return X264_CSP_I400; if( csp >= X264_CSP_I420 && csp < X264_CSP_I422 ) return X264_CSP_NV12; if( csp >= X264_CSP_I422 && csp < X264_CSP_I444 ) return X264_CSP_NV16; if( csp >= X264_CSP_I444 && csp <= X264_CSP_RGB ) return X264_CSP_I444; return X264_CSP_NONE; } static x264_frame_t *frame_new( x264_t *h, int b_fdec ) { x264_frame_t *frame; int i_csp = frame_internal_csp( h->param.i_csp ); int i_mb_count = h->mb.i_mb_count; int i_stride, i_width, i_lines, luma_plane_count; int i_padv = PADV << PARAM_INTERLACED; int align = NATIVE_ALIGN / SIZEOF_PIXEL; #if ARCH_X86 || ARCH_X86_64 if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 ) align = 64 / SIZEOF_PIXEL; else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32 / SIZEOF_PIXEL; else align = 16 / SIZEOF_PIXEL; #endif #if ARCH_PPC int disalign = (1<<9) / SIZEOF_PIXEL; #else int disalign = (1<<10) / SIZEOF_PIXEL; #endif CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); PREALLOC_INIT /* allocate frame data (+64 for extra data for me) */ i_width = h->mb.i_mb_width*16; i_lines = h->mb.i_mb_height*16; i_stride = align_stride( i_width + PADH2, align, disalign ); if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { luma_plane_count = 1; frame->i_plane = 2; for( int i = 0; i < 2; i++ ) { frame->i_width[i] = i_width >> i; frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12); frame->i_stride[i] = i_stride; } } else if( i_csp == X264_CSP_I444 ) { luma_plane_count = 3; frame->i_plane = 3; for( int i = 0; i < 3; i++ ) { frame->i_width[i] = i_width; frame->i_lines[i] = i_lines; frame->i_stride[i] = i_stride; } } else if( i_csp == X264_CSP_I400 ) { luma_plane_count = 1; frame->i_plane = 1; frame->i_width[0] = i_width; frame->i_lines[0] = i_lines; frame->i_stride[0] = i_stride; } else goto fail; frame->i_csp = i_csp; frame->i_width_lowres = frame->i_width[0]/2; frame->i_lines_lowres = frame->i_lines[0]/2; frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 ); for( int i = 0; i < h->param.i_bframe + 2; i++ ) for( int j = 0; j < h->param.i_bframe + 2; j++ ) PREALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) ); frame->i_poc = -1; frame->i_type = X264_TYPE_AUTO; frame->i_qpplus1 = X264_QP_AUTO; frame->i_pts = -1; frame->i_frame = -1; frame->i_frame_num = -1; frame->i_lines_completed = -1; frame->b_fdec = b_fdec; frame->i_pic_struct = PIC_STRUCT_AUTO; frame->i_field_cnt = -1; frame->i_duration = frame->i_cpb_duration = frame->i_dpb_output_delay = frame->i_cpb_delay = 0; frame->i_coded_fields_lookahead = frame->i_cpb_delay_lookahead = -1; frame->orig = frame; if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv)); PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL ); if( PARAM_INTERLACED ) PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL ); } /* all 4 luma planes allocated together, since the cacheline split code * 
requires them to be in-phase wrt cacheline alignment. */ for( int p = 0; p < luma_plane_count; p++ ) { int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign ); if( h->param.analyse.i_subpel_refine && b_fdec ) luma_plane_size *= 4; /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */ PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL ); if( PARAM_INTERLACED ) PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL ); } frame->b_duplicate = 0; if( b_fdec ) /* fdec frame */ { PREALLOC( frame->mb_type, i_mb_count * sizeof(int8_t) ); PREALLOC( frame->mb_partition, i_mb_count * sizeof(uint8_t) ); PREALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) ); PREALLOC( frame->mv16x16, 2*(i_mb_count+1) * sizeof(int16_t) ); PREALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) ); if( h->param.i_bframe ) { PREALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) ); PREALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) ); } else { frame->mv[1] = NULL; frame->ref[1] = NULL; } PREALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) ); PREALLOC( frame->f_row_qp, i_lines/16 * sizeof(float) ); PREALLOC( frame->f_row_qscale, i_lines/16 * sizeof(float) ); if( h->param.analyse.i_me_method >= X264_ME_ESA ) PREALLOC( frame->buffer[3], frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa ); if( PARAM_INTERLACED ) PREALLOC( frame->field, i_mb_count * sizeof(uint8_t) ); if( h->param.analyse.b_mb_info ) PREALLOC( frame->effective_qp, i_mb_count * sizeof(uint8_t) ); } else /* fenc frame */ { if( h->frames.b_have_lowres ) { int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign ); PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL ); for( int j = 0; j <= !!h->param.i_bframe; j++ ) for( int i = 0; i <= h->param.i_bframe; i++ ) { PREALLOC( frame->lowres_mvs[j][i], 2*i_mb_count*sizeof(int16_t) ); PREALLOC( frame->lowres_mv_costs[j][i], i_mb_count*sizeof(int) ); } PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) ); for( int j = 0; j <= h->param.i_bframe+1; j++ ) for( int i = 0; i <= h->param.i_bframe+1; i++ ) PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) ); } if( h->param.rc.i_aq_mode ) { PREALLOC( frame->f_qp_offset, i_mb_count * sizeof(float) ); PREALLOC( frame->f_qp_offset_aq, i_mb_count * sizeof(float) ); if( h->frames.b_have_lowres ) PREALLOC( frame->i_inv_qscale_factor, i_mb_count * sizeof(uint16_t) ); } /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. 
*/ if( h->frames.b_have_lowres ) prealloc_size += NATIVE_ALIGN; } PREALLOC_END( frame->base ); if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN; if( PARAM_INTERLACED ) frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN; } for( int p = 0; p < luma_plane_count; p++ ) { int64_t luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign ); if( h->param.analyse.i_subpel_refine && b_fdec ) { for( int i = 0; i < 4; i++ ) { frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN; if( PARAM_INTERLACED ) frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN; } frame->plane[p] = frame->filtered[p][0]; frame->plane_fld[p] = frame->filtered_fld[p][0]; } else { frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN; if( PARAM_INTERLACED ) frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN; } } if( b_fdec ) { M32( frame->mv16x16[0] ) = 0; frame->mv16x16++; if( h->param.analyse.i_me_method >= X264_ME_ESA ) frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN; } else { if( h->frames.b_have_lowres ) { int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign ); for( int i = 0; i < 4; i++ ) frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size; for( int j = 0; j <= !!h->param.i_bframe; j++ ) for( int i = 0; i <= h->param.i_bframe; i++ ) memset( frame->lowres_mvs[j][i], 0, 2*i_mb_count*sizeof(int16_t) ); frame->i_intra_cost = frame->lowres_costs[0][0]; memset( frame->i_intra_cost, -1, i_mb_count * sizeof(uint16_t) ); if( h->param.rc.i_aq_mode ) /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */ memset( frame->i_inv_qscale_factor, 0, i_mb_count * sizeof(uint16_t) ); } } if( x264_pthread_mutex_init( &frame->mutex, NULL ) ) goto fail; if( x264_pthread_cond_init( &frame->cv, NULL ) ) goto fail; #if HAVE_OPENCL frame->opencl.ocl = h->opencl.ocl; #endif return frame; fail: x264_free( frame ); return NULL; } void x264_frame_delete( x264_frame_t *frame ) { /* Duplicate frames are blank copies of real frames (including pointers), * so freeing those pointers would cause a double free later. 
*/ if( !frame->b_duplicate ) { x264_free( frame->base ); if( frame->param && frame->param->param_free ) { x264_param_cleanup( frame->param ); frame->param->param_free( frame->param ); } if( frame->mb_info_free ) frame->mb_info_free( frame->mb_info ); if( frame->extra_sei.sei_free ) { for( int i = 0; i < frame->extra_sei.num_payloads; i++ ) frame->extra_sei.sei_free( frame->extra_sei.payloads[i].payload ); frame->extra_sei.sei_free( frame->extra_sei.payloads ); } x264_pthread_mutex_destroy( &frame->mutex ); x264_pthread_cond_destroy( &frame->cv ); #if HAVE_OPENCL x264_opencl_frame_delete( frame ); #endif } x264_free( frame ); } static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift ) { int width = h->param.i_width >> xshift; int height = h->param.i_height >> yshift; *pix = src->img.plane[plane]; *stride = src->img.i_stride[plane]; if( src->img.i_csp & X264_CSP_VFLIP ) { *pix += (height-1) * *stride; *stride = -*stride; } if( width > abs(*stride) ) { x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride ); return -1; } return 0; } #define get_plane_ptr(...) do { if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; } while( 0 ) int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) { int i_csp = src->img.i_csp & X264_CSP_MASK; if( dst->i_csp != frame_internal_csp( i_csp ) ) { x264_log( h, X264_LOG_ERROR, "Invalid input colorspace\n" ); return -1; } #if HIGH_BIT_DEPTH if( !(src->img.i_csp & X264_CSP_HIGH_DEPTH) ) { x264_log( h, X264_LOG_ERROR, "This build of x264 requires high depth input. Rebuild to support 8-bit input.\n" ); return -1; } #else if( src->img.i_csp & X264_CSP_HIGH_DEPTH ) { x264_log( h, X264_LOG_ERROR, "This build of x264 requires 8-bit input. Rebuild to support high depth input.\n" ); return -1; } #endif if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 ) { x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" ); return -1; } if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME ) { x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input ); dst->i_forced_type = X264_TYPE_AUTO; } else dst->i_forced_type = src->i_type; dst->i_type = dst->i_forced_type; dst->i_qpplus1 = src->i_qpplus1; dst->i_pts = dst->i_reordered_pts = src->i_pts; dst->param = src->param; dst->i_pic_struct = src->i_pic_struct; dst->extra_sei = src->extra_sei; dst->opaque = src->opaque; dst->mb_info = h->param.analyse.b_mb_info ? src->prop.mb_info : NULL; dst->mb_info_free = h->param.analyse.b_mb_info ? 
src->prop.mb_info_free : NULL; uint8_t *pix[3]; int stride[3]; if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY ) { int p = i_csp == X264_CSP_UYVY; h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1], (pixel*)src->img.plane[0], src->img.i_stride[0]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height ); } else if( i_csp == X264_CSP_V210 ) { stride[0] = src->img.i_stride[0]; pix[0] = src->img.plane[0]; h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0], dst->plane[1], dst->i_stride[1], (uint32_t *)pix[0], stride[0]/(int)sizeof(uint32_t), h->param.i_width, h->param.i_height ); } else if( i_csp >= X264_CSP_BGR ) { stride[0] = src->img.i_stride[0]; pix[0] = src->img.plane[0]; if( src->img.i_csp & X264_CSP_VFLIP ) { pix[0] += (h->param.i_height-1) * stride[0]; stride[0] = -stride[0]; } int b = i_csp==X264_CSP_RGB; h->mc.plane_copy_deinterleave_rgb( dst->plane[1+b], dst->i_stride[1+b], dst->plane[0], dst->i_stride[0], dst->plane[2-b], dst->i_stride[2-b], (pixel*)pix[0], stride[0]/SIZEOF_PIXEL, i_csp==X264_CSP_BGRA ? 4 : 3, h->param.i_width, h->param.i_height ); } else { int v_shift = CHROMA_V_SHIFT; get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 ); h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0], stride[0]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height ); if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height>>v_shift ); } else if( i_csp == X264_CSP_NV21 ) { get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/SIZEOF_PIXEL, h->param.i_width>>1, h->param.i_height>>v_shift ); } else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16; get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift ); get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift ); h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/SIZEOF_PIXEL, (pixel*)pix[2], stride[2]/SIZEOF_PIXEL, h->param.i_width>>1, h->param.i_height>>v_shift ); } else if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 ) { get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I444 ? 1 : 2, 0, 0 ); get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I444 ? 2 : 1, 0, 0 ); h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height ); h->mc.plane_copy( dst->plane[2], dst->i_stride[2], (pixel*)pix[2], stride[2]/SIZEOF_PIXEL, h->param.i_width, h->param.i_height ); } } return 0; } static ALWAYS_INLINE void pixel_memset( pixel *dst, pixel *src, int len, int size ) { uint8_t *dstp = (uint8_t*)dst; uint32_t v1 = *src; uint32_t v2 = size == 1 ? v1 + (v1 << 8) : M16( src ); uint32_t v4 = size <= 2 ? 
v2 + (v2 << 16) : M32( src ); int i = 0; len *= size; /* Align the input pointer if it isn't already */ if( (intptr_t)dstp & (WORD_SIZE - 1) ) { if( size <= 2 && ((intptr_t)dstp & 3) ) { if( size == 1 && ((intptr_t)dstp & 1) ) dstp[i++] = v1; if( (intptr_t)dstp & 2 ) { M16( dstp+i ) = v2; i += 2; } } if( WORD_SIZE == 8 && (intptr_t)dstp & 4 ) { M32( dstp+i ) = v4; i += 4; } } /* Main copy loop */ if( WORD_SIZE == 8 ) { uint64_t v8 = v4 + ((uint64_t)v4<<32); for( ; i < len - 7; i+=8 ) M64( dstp+i ) = v8; } for( ; i < len - 3; i+=4 ) M32( dstp+i ) = v4; /* Finish up the last few bytes */ if( size <= 2 ) { if( i < len - 1 ) { M16( dstp+i ) = v2; i += 2; } if( size == 1 && i != len ) dstp[i] = v1; } } static ALWAYS_INLINE void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma ) { #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride ) for( int y = 0; y < i_height; y++ ) { /* left band */ pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y), i_padh>>b_chroma, SIZEOF_PIXEL<<b_chroma ); /* right band */ pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1-b_chroma, y), i_padh>>b_chroma, SIZEOF_PIXEL<<b_chroma ); } /* upper band */ if( b_pad_top ) for( int y = 0; y < i_padv; y++ ) memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), (i_width+2*i_padh) * SIZEOF_PIXEL ); /* lower band */ if( b_pad_bottom ) for( int y = 0; y < i_padv; y++ ) memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), (i_width+2*i_padh) * SIZEOF_PIXEL ); #undef PPIXEL } void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y ) { int pad_top = mb_y == 0; int pad_bot = mb_y == h->mb.i_mb_height - (1 << SLICE_MBAFF); int b_start = mb_y == h->i_threadslice_start; int b_end = mb_y == h->i_threadslice_end - (1 << SLICE_MBAFF); if( mb_y & SLICE_MBAFF ) return; for( int i = 0; i < frame->i_plane; i++ ) { int h_shift = i && CHROMA_H_SHIFT; int v_shift = i && CHROMA_V_SHIFT; int stride = frame->i_stride[i]; int width = 16*h->mb.i_mb_width; int height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift; int padh = PADH; int padv = PADV >> v_shift; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb if( b_end && !b_start ) height += 4 >> (v_shift + SLICE_MBAFF); pixel *pix; int starty = 16*mb_y - 4*!b_start; if( SLICE_MBAFF ) { // border samples for each field are extended separately pix = frame->plane_fld[i] + (starty*stride >> v_shift); plane_expand_border( pix, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift ); plane_expand_border( pix+stride, stride*2, width, height, padh, padv, pad_top, pad_bot, h_shift ); height = (pad_bot ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift; if( b_end && !b_start ) height += 4 >> v_shift; pix = frame->plane[i] + (starty*stride >> v_shift); plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift ); } else { pix = frame->plane[i] + (starty*stride >> v_shift); plane_expand_border( pix, stride, width, height, padh, padv, pad_top, pad_bot, h_shift ); } } } void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) { /* during filtering, 8 extra pixels were filtered on each edge, * but up to 3 of the horizontal ones may be wrong. we want to expand border from the last filtered pixel */ int b_start = !mb_y; int width = 16*h->mb.i_mb_width + 8; int height = b_end ? (16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF) + 16 : 16; int padh = PADH - 4; int padv = PADV - 8; for( int p = 0; p < (CHROMA444 ?
3 : 1); p++ ) for( int i = 1; i < 4; i++ ) { int stride = frame->i_stride[p]; // buffer: 8 luma, to match the hpel filter pixel *pix; if( SLICE_MBAFF ) { pix = frame->filtered_fld[p][i] + (16*mb_y - 16) * stride - 4; plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 ); plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 ); } pix = frame->filtered[p][i] + (16*mb_y - 8) * stride - 4; plane_expand_border( pix, stride, width, height << SLICE_MBAFF, padh, padv, b_start, b_end, 0 ); } } void x264_frame_expand_border_lowres( x264_frame_t *frame ) { for( int i = 0; i < 4; i++ ) plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 ); } void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane ) { int v_shift = CHROMA_V_SHIFT; plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift, PADH, PADV>>v_shift, 1, 1, CHROMA_H_SHIFT ); } void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame ) { for( int i = 0; i < frame->i_plane; i++ ) { int i_width = h->param.i_width; int h_shift = i && CHROMA_H_SHIFT; int v_shift = i && CHROMA_V_SHIFT; int i_height = h->param.i_height >> v_shift; int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width); int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift; if( i_padx ) { for( int y = 0; y < i_height; y++ ) pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift], i_padx>>h_shift, SIZEOF_PIXEL<<h_shift ); } if( i_pady ) { for( int y = i_height; y < i_height + i_pady; y++ ) memcpy( &frame->plane[i][y*frame->i_stride[i]], &frame->plane[i][(i_height-(~y&PARAM_INTERLACED)-1)*frame->i_stride[i]], (i_width + i_padx) * SIZEOF_PIXEL ); } } } void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y ) { for( int i = 0; i < h->fenc->i_plane; i++ ) { int v_shift = i && CHROMA_V_SHIFT; int stride = h->fenc->i_stride[i]; int height = h->param.i_height >> v_shift; int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift; pixel *fenc = h->fenc->plane[i] + 16*mb_x; for( int y = height; y < height + pady; y++ ) memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*SIZEOF_PIXEL ); } } /* threading */ void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed ) { x264_pthread_mutex_lock( &frame->mutex ); frame->i_lines_completed = i_lines_completed; x264_pthread_cond_broadcast( &frame->cv ); x264_pthread_mutex_unlock( &frame->mutex ); } int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed ) { int completed; x264_pthread_mutex_lock( &frame->mutex ); while( (completed = frame->i_lines_completed) < i_lines_completed && i_lines_completed >= 0 ) x264_pthread_cond_wait( &frame->cv, &frame->mutex ); x264_pthread_mutex_unlock( &frame->mutex ); return completed; } void x264_threadslice_cond_broadcast( x264_t *h, int pass ) { x264_pthread_mutex_lock( &h->mutex ); h->i_threadslice_pass = pass; if( pass > 0 ) x264_pthread_cond_broadcast( &h->cv ); x264_pthread_mutex_unlock( &h->mutex ); } void x264_threadslice_cond_wait( x264_t *h, int pass ) { x264_pthread_mutex_lock( &h->mutex ); while( h->i_threadslice_pass < pass ) x264_pthread_cond_wait( &h->cv, &h->mutex ); x264_pthread_mutex_unlock( &h->mutex ); } int x264_frame_new_slice( x264_t *h, x264_frame_t *frame ) { if( h->param.i_slice_count_max ) { int slice_count; if( h->param.b_sliced_threads ) slice_count = x264_pthread_fetch_and_add( &frame->i_slice_count, 1, &frame->mutex ); else
slice_count = frame->i_slice_count++; if( slice_count >= h->param.i_slice_count_max ) return -1; } return 0; } /* list operators */ void x264_frame_push( x264_frame_t **list, x264_frame_t *frame ) { int i = 0; while( list[i] ) i++; list[i] = frame; } x264_frame_t *x264_frame_pop( x264_frame_t **list ) { x264_frame_t *frame; int i = 0; assert( list[0] ); while( list[i+1] ) i++; frame = list[i]; list[i] = NULL; return frame; } void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame ) { int i = 0; while( list[i] ) i++; while( i-- ) list[i+1] = list[i]; list[0] = frame; } x264_frame_t *x264_frame_shift( x264_frame_t **list ) { x264_frame_t *frame = list[0]; int i; for( i = 0; list[i]; i++ ) list[i] = list[i+1]; assert(frame); return frame; } void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ) { assert( frame->i_reference_count > 0 ); frame->i_reference_count--; if( frame->i_reference_count == 0 ) x264_frame_push( h->frames.unused[frame->b_fdec], frame ); } x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ) { x264_frame_t *frame; if( h->frames.unused[b_fdec][0] ) frame = x264_frame_pop( h->frames.unused[b_fdec] ); else frame = frame_new( h, b_fdec ); if( !frame ) return NULL; frame->b_last_minigop_bframe = 0; frame->i_reference_count = 1; frame->b_intra_calculated = 0; frame->b_scenecut = 1; frame->b_keyframe = 0; frame->b_corrupt = 0; frame->i_slice_count = h->param.b_sliced_threads ? h->param.i_threads : 1; memset( frame->weight, 0, sizeof(frame->weight) ); memset( frame->f_weighted_cost_delta, 0, sizeof(frame->f_weighted_cost_delta) ); return frame; } void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame ) { assert( frame->i_reference_count > 0 ); frame->i_reference_count--; if( frame->i_reference_count == 0 ) x264_frame_push( h->frames.blank_unused, frame ); } x264_frame_t *x264_frame_pop_blank_unused( x264_t *h ) { x264_frame_t *frame; if( h->frames.blank_unused[0] ) frame = x264_frame_pop( h->frames.blank_unused ); else frame = x264_malloc( sizeof(x264_frame_t) ); if( !frame ) return NULL; frame->b_duplicate = 1; frame->i_reference_count = 1; return frame; } void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, int i_width, int i_height, x264_weight_t *w ) { /* Weight horizontal strips of height 16. This was found to be the optimal height * in terms of the cache loads. 
*/ while( i_height > 0 ) { int x; for( x = 0; x < i_width-8; x += 16 ) w->weightfn[16>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) ); if( x < i_width ) w->weightfn[ 8>>2]( dst+x, i_dst_stride, src+x, i_src_stride, w, X264_MIN( i_height, 16 ) ); i_height -= 16; dst += 16 * i_dst_stride; src += 16 * i_src_stride; } } void x264_frame_delete_list( x264_frame_t **list ) { int i = 0; if( !list ) return; while( list[i] ) x264_frame_delete( list[i++] ); x264_free( list ); } int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int max_size ) { if( max_size < 0 ) return -1; slist->i_max_size = max_size; slist->i_size = 0; CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) ); if( x264_pthread_mutex_init( &slist->mutex, NULL ) || x264_pthread_cond_init( &slist->cv_fill, NULL ) || x264_pthread_cond_init( &slist->cv_empty, NULL ) ) return -1; return 0; fail: return -1; } void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist ) { x264_pthread_mutex_destroy( &slist->mutex ); x264_pthread_cond_destroy( &slist->cv_fill ); x264_pthread_cond_destroy( &slist->cv_empty ); x264_frame_delete_list( slist->list ); } void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame ) { x264_pthread_mutex_lock( &slist->mutex ); while( slist->i_size == slist->i_max_size ) x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex ); slist->list[ slist->i_size++ ] = frame; x264_pthread_mutex_unlock( &slist->mutex ); x264_pthread_cond_broadcast( &slist->cv_fill ); } x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist ) { x264_frame_t *frame; x264_pthread_mutex_lock( &slist->mutex ); while( !slist->i_size ) x264_pthread_cond_wait( &slist->cv_fill, &slist->mutex ); frame = slist->list[ --slist->i_size ]; slist->list[ slist->i_size ] = NULL; x264_pthread_cond_broadcast( &slist->cv_empty ); x264_pthread_mutex_unlock( &slist->mutex ); return frame; } x264-master/common/frame.h000066400000000000000000000313331502133446700156150ustar00rootroot00000000000000/***************************************************************************** * frame.h: frame handling ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_FRAME_H #define X264_FRAME_H /* number of pixels past the edge of the frame, for motion estimation/compensation */ #define PADH 32 #define PADV 32 #define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL ) #define PADH2 (PADH_ALIGN + PADH) typedef struct x264_frame { /* */ uint8_t *base; /* Base pointer for all malloced data in this frame. */ int i_poc; int i_delta_poc[2]; int i_type; int i_forced_type; int i_qpplus1; int64_t i_pts; int64_t i_dts; int64_t i_reordered_pts; int64_t i_duration; /* in SPS time_scale units (i.e 2 * timebase units) used for vfr */ float f_duration; /* in seconds */ int64_t i_cpb_duration; int64_t i_cpb_delay; /* in SPS time_scale units (i.e 2 * timebase units) */ int64_t i_dpb_output_delay; x264_param_t *param; int i_frame; /* Presentation frame number */ int i_coded; /* Coded frame number */ int64_t i_field_cnt; /* Presentation field count */ int i_frame_num; /* 7.4.3 frame_num */ int b_kept_as_ref; int i_pic_struct; int b_keyframe; uint8_t b_fdec; uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */ uint8_t i_bframes; /* number of bframes following this nonb in coded order */ float f_qp_avg_rc; /* QPs as decided by ratecontrol */ float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */ float f_crf_avg; /* Average effective CRF for this frame */ int i_poc_l0ref0; /* poc of first refframe in L0, used to check if direct temporal is possible */ /* YUV buffer */ int i_csp; /* Internal csp */ int i_plane; int i_stride[3]; int i_width[3]; int i_lines[3]; int i_stride_lowres; int i_width_lowres; int i_lines_lowres; pixel *plane[3]; pixel *plane_fld[3]; pixel *filtered[3][4]; /* plane[0], H, V, HV */ pixel *filtered_fld[3][4]; pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */ uint16_t *integral; /* for unrestricted mv we allocate more data than needed * allocated data are stored in buffer */ pixel *buffer[4]; pixel *buffer_fld[4]; pixel *buffer_lowres; x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */ pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */ int b_duplicate; struct x264_frame *orig; /* motion data */ int8_t *mb_type; uint8_t *mb_partition; int16_t (*mv[2])[2]; int16_t (*mv16x16)[2]; int16_t (*lowres_mvs[2][X264_BFRAME_MAX+1])[2]; uint8_t *field; uint8_t *effective_qp; /* Stored as (lists_used << LOWRES_COST_SHIFT) + (cost). * Doesn't need special addressing for intra cost because * lists_used is guaranteed to be zero in that cast. */ uint16_t (*lowres_costs[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]); #define LOWRES_COST_MASK ((1<<14)-1) #define LOWRES_COST_SHIFT 14 int *lowres_mv_costs[2][X264_BFRAME_MAX+1]; int8_t *ref[2]; int i_ref[2]; int ref_poc[2][X264_REF_MAX]; int16_t inv_ref_poc[2]; // inverse values of ref0 poc to avoid divisions in temporal MV prediction /* for adaptive B-frame decision. * contains the SATD cost of the lowres frame encoded in various modes * FIXME: how big an array do we need? 
*/ int i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]; int i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]; int i_satd; // the i_cost_est of the selected frametype int i_intra_mbs[X264_BFRAME_MAX+2]; int *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2]; int *i_row_satd; int *i_row_bits; float *f_row_qp; float *f_row_qscale; float *f_qp_offset; float *f_qp_offset_aq; int b_intra_calculated; uint16_t *i_intra_cost; uint16_t *i_propagate_cost; uint16_t *i_inv_qscale_factor; int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */ float f_weighted_cost_delta[X264_BFRAME_MAX+2]; uint32_t i_pixel_sum[3]; uint64_t i_pixel_ssd[3]; /* hrd */ x264_hrd_t hrd_timing; /* vbv */ uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1]; int i_planned_satd[X264_LOOKAHEAD_MAX+1]; double f_planned_cpb_duration[X264_LOOKAHEAD_MAX+1]; int64_t i_coded_fields_lookahead; int64_t i_cpb_delay_lookahead; /* threading */ int i_lines_completed; /* in pixels */ int i_lines_weighted; /* FIXME: this only supports weighting of one reference frame */ int i_reference_count; /* number of threads using this frame (not necessarily the number of pointers) */ x264_pthread_mutex_t mutex; x264_pthread_cond_t cv; int i_slice_count; /* Atomically written to/read from with slice threads */ /* periodic intra refresh */ float f_pir_position; int i_pir_start_col; int i_pir_end_col; int i_frames_since_pir; /* interactive encoder control */ int b_corrupt; /* user sei */ x264_sei_t extra_sei; /* user data */ void *opaque; /* user frame properties */ uint8_t *mb_info; void (*mb_info_free)( void* ); #if HAVE_OPENCL x264_frame_opencl_t opencl; #endif } x264_frame_t; /* synchronized frame list */ typedef struct { x264_frame_t **list; int i_max_size; int i_size; x264_pthread_mutex_t mutex; x264_pthread_cond_t cv_fill; /* event signaling that the list became fuller */ x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */ } x264_sync_frame_list_t; typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta ); typedef struct { x264_deblock_inter_t deblock_luma[2]; x264_deblock_inter_t deblock_chroma[2]; x264_deblock_inter_t deblock_h_chroma_420; x264_deblock_inter_t deblock_h_chroma_422; x264_deblock_intra_t deblock_luma_intra[2]; x264_deblock_intra_t deblock_chroma_intra[2]; x264_deblock_intra_t deblock_h_chroma_420_intra; x264_deblock_intra_t deblock_h_chroma_422_intra; x264_deblock_inter_t deblock_luma_mbaff; x264_deblock_inter_t deblock_chroma_mbaff; x264_deblock_inter_t deblock_chroma_420_mbaff; x264_deblock_inter_t deblock_chroma_422_mbaff; x264_deblock_intra_t deblock_luma_intra_mbaff; x264_deblock_intra_t deblock_chroma_intra_mbaff; x264_deblock_intra_t deblock_chroma_420_intra_mbaff; x264_deblock_intra_t deblock_chroma_422_intra_mbaff; void (*deblock_strength)( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); } x264_deblock_function_t; #define x264_frame_delete x264_template(frame_delete) void x264_frame_delete( x264_frame_t *frame ); #define x264_frame_copy_picture x264_template(frame_copy_picture) int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ); #define x264_frame_expand_border x264_template(frame_expand_border) void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y ); #define 
x264_frame_expand_border_filtered x264_template(frame_expand_border_filtered) void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ); #define x264_frame_expand_border_lowres x264_template(frame_expand_border_lowres) void x264_frame_expand_border_lowres( x264_frame_t *frame ); #define x264_frame_expand_border_chroma x264_template(frame_expand_border_chroma) void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane ); #define x264_frame_expand_border_mod16 x264_template(frame_expand_border_mod16) void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame ); #define x264_expand_border_mbpair x264_template(expand_border_mbpair) void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y ); #define x264_frame_deblock_row x264_template(frame_deblock_row) void x264_frame_deblock_row( x264_t *h, int mb_y ); #define x264_macroblock_deblock x264_template(macroblock_deblock) void x264_macroblock_deblock( x264_t *h ); #define x264_frame_filter x264_template(frame_filter) void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ); #define x264_frame_init_lowres x264_template(frame_init_lowres) void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame ); #define x264_deblock_init x264_template(deblock_init) void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff ); #define x264_frame_cond_broadcast x264_template(frame_cond_broadcast) void x264_frame_cond_broadcast( x264_frame_t *frame, int i_lines_completed ); #define x264_frame_cond_wait x264_template(frame_cond_wait) int x264_frame_cond_wait( x264_frame_t *frame, int i_lines_completed ); #define x264_frame_new_slice x264_template(frame_new_slice) int x264_frame_new_slice( x264_t *h, x264_frame_t *frame ); #define x264_threadslice_cond_broadcast x264_template(threadslice_cond_broadcast) void x264_threadslice_cond_broadcast( x264_t *h, int pass ); #define x264_threadslice_cond_wait x264_template(threadslice_cond_wait) void x264_threadslice_cond_wait( x264_t *h, int pass ); #define x264_frame_push x264_template(frame_push) X264_API void x264_frame_push( x264_frame_t **list, x264_frame_t *frame ); #define x264_frame_pop x264_template(frame_pop) X264_API x264_frame_t *x264_frame_pop( x264_frame_t **list ); #define x264_frame_unshift x264_template(frame_unshift) X264_API void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame ); #define x264_frame_shift x264_template(frame_shift) X264_API x264_frame_t *x264_frame_shift( x264_frame_t **list ); #define x264_frame_push_unused x264_template(frame_push_unused) void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ); #define x264_frame_push_blank_unused x264_template(frame_push_blank_unused) void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame ); #define x264_frame_pop_blank_unused x264_template(frame_pop_blank_unused) x264_frame_t *x264_frame_pop_blank_unused( x264_t *h ); #define x264_weight_scale_plane x264_template(weight_scale_plane) void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, int i_width, int i_height, x264_weight_t *w ); #define x264_frame_pop_unused x264_template(frame_pop_unused) x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ); #define x264_frame_delete_list x264_template(frame_delete_list) void x264_frame_delete_list( x264_frame_t **list ); #define x264_sync_frame_list_init x264_template(sync_frame_list_init) int x264_sync_frame_list_init( x264_sync_frame_list_t *slist, int nelem ); 
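The x264_sync_frame_list_* functions declared around this point (and implemented in common/frame.c earlier in this dump) form a small bounded, blocking frame queue: push sleeps on cv_empty while the list is full, pop sleeps on cv_fill while it is empty, and both re-check their condition in a while loop while holding the mutex. The following is only a rough standalone sketch of that same producer/consumer pattern, using plain pthreads and a generic void* payload with hypothetical names (bounded_queue_t, queue_init, queue_push, queue_pop) rather than x264's own x264_frame_t and x264_pthread_* wrappers:

/* Illustrative sketch only; not part of x264. */
#include <pthread.h>
#include <stdlib.h>

typedef struct {
    void **items;
    int max_size;
    int size;
    pthread_mutex_t mutex;
    pthread_cond_t cv_fill;   /* the queue gained an item */
    pthread_cond_t cv_empty;  /* the queue lost an item */
} bounded_queue_t;

static int queue_init( bounded_queue_t *q, int max_size )
{
    q->items = calloc( max_size, sizeof(void*) );
    if( !q->items )
        return -1;
    q->max_size = max_size;
    q->size = 0;
    pthread_mutex_init( &q->mutex, NULL );
    pthread_cond_init( &q->cv_fill, NULL );
    pthread_cond_init( &q->cv_empty, NULL );
    return 0;
}

static void queue_push( bounded_queue_t *q, void *item )
{
    pthread_mutex_lock( &q->mutex );
    while( q->size == q->max_size )         /* block while full */
        pthread_cond_wait( &q->cv_empty, &q->mutex );
    q->items[q->size++] = item;
    pthread_mutex_unlock( &q->mutex );
    pthread_cond_broadcast( &q->cv_fill );  /* wake blocked consumers */
}

static void *queue_pop( bounded_queue_t *q )
{
    pthread_mutex_lock( &q->mutex );
    while( !q->size )                       /* block while empty */
        pthread_cond_wait( &q->cv_fill, &q->mutex );
    void *item = q->items[--q->size];
    q->items[q->size] = NULL;
    pthread_cond_broadcast( &q->cv_empty ); /* wake blocked producers */
    pthread_mutex_unlock( &q->mutex );
    return item;
}

Broadcasting rather than signaling mirrors the x264 code: more than one thread may be blocked on the same list, and because every waiter re-tests its predicate in a loop, extra or spurious wakeups are harmless.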
#define x264_sync_frame_list_delete x264_template(sync_frame_list_delete) void x264_sync_frame_list_delete( x264_sync_frame_list_t *slist ); #define x264_sync_frame_list_push x264_template(sync_frame_list_push) void x264_sync_frame_list_push( x264_sync_frame_list_t *slist, x264_frame_t *frame ); #define x264_sync_frame_list_pop x264_template(sync_frame_list_pop) x264_frame_t *x264_sync_frame_list_pop( x264_sync_frame_list_t *slist ); #endif x264-master/common/loongarch/000077500000000000000000000000001502133446700163235ustar00rootroot00000000000000x264-master/common/loongarch/dct-a.S000066400000000000000000002041541502133446700174450ustar00rootroot00000000000000/***************************************************************************** * dct-a.S: LoongArch transform and zigzag ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Peng Zhou * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" const hsub_mul .rept 16 .byte 1, -1 .endr endconst const last64_shuf .int 0, 4, 1, 5, 2, 6, 3, 7 endconst const zigzag_scan4 .short 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 endconst .macro LOAD_DIFF8x4_LASX s1, s2, s3, s4, s5, s6, s7, s8, s9, s10 fld.d $f\s1, a1, FENC_STRIDE * \s7 fld.d $f\s2, a1, FENC_STRIDE * \s8 fld.d $f\s5, a1, FENC_STRIDE * \s9 fld.d $f\s6, a1, FENC_STRIDE * \s10 xvinsve0.d $xr\s1, $xr\s5, 2 xvinsve0.d $xr\s2, $xr\s6, 2 fld.d $f\s3, a2, FDEC_STRIDE * \s7 fld.d $f\s4, a2, FDEC_STRIDE * \s8 fld.d $f\s5, a2, FDEC_STRIDE * \s9 fld.d $f\s6, a2, FDEC_STRIDE * \s10 xvinsve0.d $xr\s3, $xr\s5, 2 xvinsve0.d $xr\s4, $xr\s6, 2 xvilvl.b $xr\s1, xr8, $xr\s1 xvilvl.b $xr\s2, xr8, $xr\s2 xvilvl.b $xr\s3, xr8, $xr\s3 xvilvl.b $xr\s4, xr8, $xr\s4 xvsub.h $xr\s1, $xr\s1, $xr\s3 xvsub.h $xr\s2, $xr\s2, $xr\s4 .endm .macro DCT4_1D_LASX s0, s1, s2, s3, s4 xvadd.h \s4, \s3, \s0 xvsub.h \s0, \s0, \s3 xvadd.h \s3, \s2, \s1 xvsub.h \s1, \s1, \s2 xvadd.h \s2, \s3, \s4 xvsub.h \s4, \s4, \s3 xvsub.h \s3, \s0, \s1 xvsub.h \s3, \s3, \s1 xvadd.h \s0, \s0, \s0 xvadd.h \s0, \s0, \s1 .endm .macro LSX_SUMSUB_H sum, sub, a, b vadd.h \sum, \a, \b vsub.h \sub, \a, \b .endm .macro DCT4_1D_LSX s0, s1, s2, s3, s4, s5, s6, s7 LSX_SUMSUB_H \s1, \s6, \s5, \s6 LSX_SUMSUB_H \s3, \s7, \s4, \s7 vadd.h \s0, \s3, \s1 vadd.h \s4, \s7, \s7 vadd.h \s5, \s6, \s6 vsub.h \s2, \s3, \s1 vadd.h \s1, \s4, \s6 vsub.h \s3, \s7, \s5 .endm .macro SUB8x8_DCT_CORE_LASX LOAD_DIFF8x4_LASX 0, 1, 2, 3, 4, 5, 0, 1, 4, 5 LOAD_DIFF8x4_LASX 2, 3, 4, 5, 6, 7, 2, 3, 6, 7 DCT4_1D_LASX xr0, xr1, xr2, xr3, xr4 LASX_TRANSPOSE2x4x4_H xr0, xr2, xr3, xr4, xr0, xr1, \ xr2, xr3, 
xr10, xr12, xr13 DCT4_1D_LASX xr2, xr0, xr3, xr1, xr4 xvilvh.d xr0, xr2, xr3 /* 6, 2 */ xvilvl.d xr3, xr2, xr3 /* 4, 0 */ xvilvh.d xr2, xr1, xr4 /* 7, 3 */ xvilvl.d xr4, xr1, xr4 /* 5, 1 */ xvor.v xr1, xr3, xr3 xvpermi.q xr3, xr4, 0x02 /* 1, 0 */ xvor.v xr5, xr0, xr0 xvpermi.q xr0, xr2, 0x02 /* 3, 2 */ xvpermi.q xr1, xr4, 0x13 /* 4, 5 */ xvpermi.q xr5, xr2, 0x13 /* 7, 6 */ xvst xr3, a0, 0 xvst xr0, a0, 16 * 2 xvst xr1, a0, 16 * 4 xvst xr5, a0, 16 * 6 .endm .macro SUB8x8_DCT_CORE_LSX fld.d f0, a1, FENC_STRIDE * 0 fld.d f1, a1, FENC_STRIDE * 1 fld.d f4, a1, FENC_STRIDE * 4 fld.d f5, a1, FENC_STRIDE * 5 fld.d f2, a2, FDEC_STRIDE * 0 fld.d f3, a2, FDEC_STRIDE * 1 fld.d f6, a2, FDEC_STRIDE * 4 fld.d f7, a2, FDEC_STRIDE * 5 vilvl.b vr0, vr8, vr0 vilvl.b vr1, vr8, vr1 vilvl.b vr4, vr8, vr4 vilvl.b vr5, vr8, vr5 vilvl.b vr2, vr8, vr2 vilvl.b vr3, vr8, vr3 vilvl.b vr6, vr8, vr6 vilvl.b vr7, vr8, vr7 vsub.h vr0, vr0, vr2 vsub.h vr4, vr4, vr6 vsub.h vr1, vr1, vr3 vsub.h vr5, vr5, vr7 fld.d f2, a1, FENC_STRIDE * 2 fld.d f3, a1, FENC_STRIDE * 3 fld.d f6, a1, FENC_STRIDE * 6 fld.d f7, a1, FENC_STRIDE * 7 fld.d f9, a2, FDEC_STRIDE * 2 fld.d f11, a2, FDEC_STRIDE * 3 fld.d f10, a2, FDEC_STRIDE * 6 fld.d f12, a2, FDEC_STRIDE * 7 vilvl.b vr2, vr8, vr2 vilvl.b vr3, vr8, vr3 vilvl.b vr6, vr8, vr6 vilvl.b vr7, vr8, vr7 vilvl.b vr9, vr8, vr9 vilvl.b vr11, vr8, vr11 vilvl.b vr10, vr8, vr10 vilvl.b vr12, vr8, vr12 vsub.h vr2, vr2, vr9 vsub.h vr6, vr6, vr10 vsub.h vr3, vr3, vr11 vsub.h vr7, vr7, vr12 vadd.h vr9, vr3, vr0 vadd.h vr10, vr7, vr4 vsub.h vr0, vr0, vr3 vsub.h vr4, vr4, vr7 vadd.h vr3, vr2, vr1 vadd.h vr7, vr6, vr5 vsub.h vr1, vr1, vr2 vsub.h vr5, vr5, vr6 vadd.h vr2, vr3, vr9 vadd.h vr6, vr7, vr10 vsub.h vr9, vr9, vr3 vsub.h vr10, vr10, vr7 vsub.h vr3, vr0, vr1 vsub.h vr7, vr4, vr5 vsub.h vr3, vr3, vr1 vsub.h vr7, vr7, vr5 vadd.h vr0, vr0, vr0 vadd.h vr4, vr4, vr4 vadd.h vr0, vr0, vr1 vadd.h vr4, vr4, vr5 vilvh.h vr11, vr0, vr2 vilvh.h vr12, vr4, vr6 vilvl.h vr13, vr0, vr2 vilvl.h vr14, vr4, vr6 vilvh.h vr15, vr3, vr9 vilvh.h vr16, vr7, vr10 vilvl.h vr17, vr3, vr9 vilvl.h vr18, vr7, vr10 vilvh.w vr19, vr17, vr13 vilvh.w vr20, vr18, vr14 vilvl.w vr13, vr17, vr13 vilvl.w vr14, vr18, vr14 vilvh.w vr17, vr15, vr11 vilvh.w vr18, vr16, vr12 vilvl.w vr11, vr15, vr11 vilvl.w vr12, vr16, vr12 vilvh.d vr0, vr11, vr13 vilvh.d vr4, vr12, vr14 vilvl.d vr2, vr11, vr13 vilvl.d vr6, vr12, vr14 vilvh.d vr1, vr17, vr19 vilvh.d vr5, vr18, vr20 vilvl.d vr3, vr17, vr19 vilvl.d vr7, vr18, vr20 vadd.h vr9, vr1, vr2 vadd.h vr10, vr5, vr6 vsub.h vr2, vr2, vr1 vsub.h vr6, vr6, vr5 vadd.h vr1, vr3, vr0 vadd.h vr5, vr7, vr4 vsub.h vr0, vr0, vr3 vsub.h vr4, vr4, vr7 vadd.h vr3, vr1, vr9 vadd.h vr7, vr5, vr10 vsub.h vr9, vr9, vr1 vsub.h vr10, vr10, vr5 vsub.h vr1, vr2, vr0 vsub.h vr5, vr6, vr4 vsub.h vr1, vr1, vr0 vsub.h vr5, vr5, vr4 vadd.h vr2, vr2, vr2 vadd.h vr6, vr6, vr6 vadd.h vr2, vr2, vr0 vadd.h vr6, vr6, vr4 vilvh.d vr0, vr2, vr3 vilvh.d vr4, vr6, vr7 vilvl.d vr3, vr2, vr3 vilvl.d vr7, vr6, vr7 vilvh.d vr2, vr1, vr9 vilvh.d vr6, vr5, vr10 vilvl.d vr9, vr1, vr9 vilvl.d vr10, vr5, vr10 vor.v vr1, vr3, vr3 vor.v vr5, vr7, vr7 vor.v vr12, vr4, vr4 vst vr3, a0, 0 vst vr9, a0, 16 vst vr0, a0, 32 vst vr2, a0, 48 vst vr5, a0, 64 vst vr10, a0, 80 vst vr12, a0, 96 vst vr6, a0, 112 .endm /* void subwxh_dct( dctcoef*, pixel*, pixel* ) */ function_x264 sub4x4_dct_lsx fld.s f0, a1, 0 fld.s f4, a2, 0 fld.s f1, a1, FENC_STRIDE fld.s f5, a2, FDEC_STRIDE vsllwil.hu.bu vr0, vr0, 0 vsllwil.hu.bu vr1, vr1, 0 vsllwil.hu.bu vr4, vr4, 0 
vsllwil.hu.bu vr5, vr5, 0 fld.s f2, a1, FENC_STRIDE * 2 fld.s f6, a2, FDEC_STRIDE * 2 fld.s f3, a1, FENC_STRIDE * 3 fld.s f7, a2, FDEC_STRIDE * 3 vsllwil.hu.bu vr2, vr2, 0 vsllwil.hu.bu vr3, vr3, 0 vsllwil.hu.bu vr6, vr6, 0 vsllwil.hu.bu vr7, vr7, 0 vsub.h vr0, vr0, vr4 vsub.h vr1, vr1, vr5 vsub.h vr2, vr2, vr6 vsub.h vr3, vr3, vr7 DCT4_1D_LSX vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr4, vr5, vr6, vr7, vr0, vr1 DCT4_1D_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 vshuf4i.d vr0, vr1, 0x8 vshuf4i.d vr2, vr3, 0x8 vst vr0, a0, 0 vst vr2, a0, 16 endfunc_x264 function_x264 sub8x8_dct_lasx xvxor.v xr8, xr8, xr8 SUB8x8_DCT_CORE_LASX endfunc_x264 function_x264 sub8x8_dct_lsx vxor.v vr8, vr8, vr8 SUB8x8_DCT_CORE_LSX endfunc_x264 function_x264 sub16x16_dct_lasx xvxor.v xr8, xr8, xr8 SUB8x8_DCT_CORE_LASX addi.d a0, a0, 32 * 4 addi.d a1, a1, 8 addi.d a2, a2, 8 SUB8x8_DCT_CORE_LASX addi.d a0, a0, 32 * 4 addi.d a1, a1, 8*FENC_STRIDE - 8 addi.d a2, a2, 8*FDEC_STRIDE - 8 SUB8x8_DCT_CORE_LASX addi.d a0, a0, 32 * 4 addi.d a1, a1, 8 addi.d a2, a2, 8 SUB8x8_DCT_CORE_LASX endfunc_x264 function_x264 sub16x16_dct_lsx vxor.v vr8, vr8, vr8 SUB8x8_DCT_CORE_LSX addi.d a0, a0, 32 * 4 addi.d a1, a1, 8 addi.d a2, a2, 8 SUB8x8_DCT_CORE_LSX addi.d a0, a0, 32 * 4 addi.d a1, a1, 8*FENC_STRIDE - 8 addi.d a2, a2, 8*FDEC_STRIDE - 8 SUB8x8_DCT_CORE_LSX addi.d a0, a0, 32 * 4 addi.d a1, a1, 8 addi.d a2, a2, 8 SUB8x8_DCT_CORE_LSX endfunc_x264 /* * void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) */ function_x264 add4x4_idct_lsx vxor.v vr0, vr1, vr1 fld.d f1, a1, 0 fld.d f2, a1, 8 fld.d f3, a1, 16 fld.d f4, a1, 24 vsrai.h vr5, vr2, 1 vsrai.h vr6, vr4, 1 vilvl.h vr1, vr1, vr3 vilvl.h vr15, vr2, vr6 vilvl.h vr16, vr5, vr4 vhaddw.w.h vr7, vr1, vr1 vhsubw.w.h vr8, vr1, vr1 vhaddw.w.h vr9, vr15, vr15 vhsubw.w.h vr10, vr16, vr16 vadd.w vr1, vr7, vr9 vadd.w vr2, vr8, vr10 vsub.w vr3, vr8, vr10 vsub.w vr4, vr7, vr9 vpickev.h vr1, vr1, vr1 vpickev.h vr2, vr2, vr2 vpickev.h vr3, vr3, vr3 vpickev.h vr4, vr4, vr4 LSX_TRANSPOSE4x4_H vr1, vr2, vr3, vr4, vr1, vr2, vr3, vr4, vr5, vr6 vsrai.h vr5, vr2, 1 vsrai.h vr6, vr4, 1 vilvl.h vr1, vr1, vr3 vilvl.h vr15, vr2, vr6 vilvl.h vr16, vr5, vr4 vhaddw.w.h vr7, vr1, vr1 vhsubw.w.h vr8, vr1, vr1 vhaddw.w.h vr9, vr15, vr15 vhsubw.w.h vr10, vr16, vr16 vadd.w vr1, vr7, vr9 vadd.w vr2, vr8, vr10 vsub.w vr3, vr8, vr10 vsub.w vr4, vr7, vr9 vssrarni.h.w vr2, vr1, 6 vssrarni.h.w vr4, vr3, 6 fld.s f1, a0, 0 fld.s f5, a0, FDEC_STRIDE fld.s f3, a0, FDEC_STRIDE * 2 fld.s f6, a0, FDEC_STRIDE * 3 vilvl.b vr1, vr0, vr1 vilvl.b vr5, vr0, vr5 vilvl.b vr3, vr0, vr3 vilvl.b vr6, vr0, vr6 vilvl.d vr1, vr5, vr1 vilvl.d vr3, vr6, vr3 vadd.h vr7, vr1, vr2 vadd.h vr8, vr3, vr4 vssrarni.bu.h vr8, vr7, 0 vstelm.w vr8, a0, 0, 0 vstelm.w vr8, a0, FDEC_STRIDE, 1 vstelm.w vr8, a0, FDEC_STRIDE * 2, 2 vstelm.w vr8, a0, FDEC_STRIDE * 3, 3 endfunc_x264 .macro LASX_SUMSUB_W sum, diff, in0, in1 xvadd.w \sum, \in0, \in1 xvsub.w \diff, \in0, \in1 .endm .macro add8x4_idct_core_lasx fld.d f1, a1, 0 fld.d f2, a1, 8 fld.d f3, a1, 16 fld.d f4, a1, 24 fld.d f5, a1, 32 fld.d f6, a1, 40 fld.d f7, a1, 48 fld.d f8, a1, 56 xvinsve0.d xr1, xr5, 1 xvinsve0.d xr2, xr6, 1 xvinsve0.d xr3, xr7, 1 xvinsve0.d xr4, xr8, 1 xvsrai.h xr8, xr2, 1 xvsrai.h xr9, xr4, 1 vext2xv.w.h xr1, xr1 vext2xv.w.h xr5, xr2 vext2xv.w.h xr6, xr3 vext2xv.w.h xr7, xr4 vext2xv.w.h xr8, xr8 vext2xv.w.h xr9, xr9 LASX_SUMSUB_W xr10, xr11, xr1, xr6 xvadd.w xr12, xr5, xr9 xvsub.w xr13, xr8, xr7 LASX_SUMSUB_W xr6, xr9, xr10, xr12 LASX_SUMSUB_W xr7, xr8, 
xr11, xr13 xvpickev.h xr10, xr6, xr6 xvpickev.h xr11, xr7, xr7 xvpickev.h xr12, xr8, xr8 xvpickev.h xr13, xr9, xr9 LASX_TRANSPOSE4x8_H xr10, xr11, xr12, xr13, xr10, xr11, xr12, xr13, \ xr4, xr5 xvsllwil.w.h xr10, xr10, 0 xvsllwil.w.h xr11, xr11, 0 xvsllwil.w.h xr12, xr12, 0 xvsllwil.w.h xr13, xr13, 0 xvsrai.w xr14, xr11, 1 xvsrai.w xr15, xr13, 1 LASX_SUMSUB_W xr4, xr5, xr10, xr12 xvadd.w xr6, xr11, xr15 xvsub.w xr7, xr14, xr13 LASX_SUMSUB_W xr10, xr13, xr4, xr6 LASX_SUMSUB_W xr11, xr12, xr5, xr7 xvssrarni.h.w xr11, xr10, 6 xvssrarni.h.w xr13, xr12, 6 fld.s f1, a0, 0 fld.s f2, a0, FDEC_STRIDE fld.s f3, a0, FDEC_STRIDE * 2 fld.s f4, a0, FDEC_STRIDE * 3 fld.s f5, a0, 4 fld.s f6, a0, FDEC_STRIDE + 4 fld.s f7, a0, FDEC_STRIDE * 2 + 4 fld.s f8, a0, FDEC_STRIDE * 3 + 4 xvinsve0.w xr1, xr2, 1 xvinsve0.w xr3, xr4, 1 xvinsve0.w xr5, xr6, 1 xvinsve0.w xr7, xr8, 1 xvinsve0.d xr1, xr5, 2 xvinsve0.d xr3, xr7, 2 xvilvl.b xr1, xr0, xr1 xvilvl.b xr3, xr0, xr3 xvadd.h xr1, xr1, xr11 xvadd.h xr3, xr3, xr13 xvssrarni.bu.h xr3, xr1, 0 xvstelm.w xr3, a0, 0, 0 xvstelm.w xr3, a0, FDEC_STRIDE, 1 xvstelm.w xr3, a0, FDEC_STRIDE * 2, 2 xvstelm.w xr3, a0, FDEC_STRIDE * 3, 3 xvstelm.w xr3, a0, 4, 4 xvstelm.w xr3, a0, FDEC_STRIDE + 4, 5 xvstelm.w xr3, a0, FDEC_STRIDE * 2 + 4, 6 xvstelm.w xr3, a0, FDEC_STRIDE * 3 + 4, 7 .endm .macro LSX_SUMSUB_W sum0, sum1, diff0, diff1, in0, in1, in2, in3 vadd.w \sum0, \in0, \in2 vadd.w \sum1, \in1, \in3 vsub.w \diff0, \in0, \in2 vsub.w \diff1, \in1, \in3 .endm .macro add8x4_idct_core_lsx fld.d f1, a1, 0 fld.d f2, a1, 8 fld.d f3, a1, 16 fld.d f4, a1, 24 fld.d f5, a1, 32 fld.d f6, a1, 40 fld.d f7, a1, 48 fld.d f8, a1, 56 vpermi.w vr9, vr6, 0x04 vpermi.w vr9, vr2, 0x44 vpermi.w vr10, vr8, 0x04 vpermi.w vr10, vr4, 0x44 vsrai.h vr9, vr9, 1 vsrai.h vr10, vr10, 1 vsllwil.w.h vr1, vr1, 0 vsllwil.w.h vr5, vr5, 0 vsllwil.w.h vr2, vr2, 0 vsllwil.w.h vr6, vr6, 0 vsllwil.w.h vr3, vr3, 0 vsllwil.w.h vr7, vr7, 0 vsllwil.w.h vr4, vr4, 0 vsllwil.w.h vr8, vr8, 0 vexth.w.h vr11, vr9 vsllwil.w.h vr9, vr9, 0 vexth.w.h vr12, vr10 vsllwil.w.h vr10, vr10, 0 LSX_SUMSUB_W vr13, vr14, vr15, vr16, vr1, vr5, vr3, vr7 vadd.w vr17, vr2, vr10 vadd.w vr18, vr6, vr12 vsub.w vr19, vr9, vr4 vsub.w vr20, vr11, vr8 LSX_SUMSUB_W vr3, vr7, vr10, vr12, vr13, vr14, vr17, vr18 LSX_SUMSUB_W vr4, vr8, vr9, vr11, vr15, vr16, vr19, vr20 vpickev.h vr13, vr3, vr3 vpickev.h vr14, vr7, vr7 vpickev.h vr15, vr4, vr4 vpickev.h vr16, vr8, vr8 vpickev.h vr17, vr9, vr9 vpickev.h vr18, vr11, vr11 vpickev.h vr19, vr10, vr10 vpickev.h vr20, vr12, vr12 LSX_TRANSPOSE4x4_H vr13, vr15, vr17, vr19, vr13, vr15, vr17, vr19, vr1, vr3 LSX_TRANSPOSE4x4_H vr14, vr16, vr18, vr20, vr14, vr16, vr18, vr20, vr2, vr4 vsllwil.w.h vr13, vr13, 0 vsllwil.w.h vr14, vr14, 0 vsllwil.w.h vr15, vr15, 0 vsllwil.w.h vr16, vr16, 0 vsllwil.w.h vr17, vr17, 0 vsllwil.w.h vr18, vr18, 0 vsllwil.w.h vr19, vr19, 0 vsllwil.w.h vr20, vr20, 0 vsrai.w vr1, vr15, 1 vsrai.w vr2, vr16, 1 vsrai.w vr3, vr19, 1 vsrai.w vr4, vr20, 1 LSX_SUMSUB_W vr5, vr6, vr21, vr22, vr13, vr14, vr17, vr18 vadd.w vr8, vr15, vr3 vadd.w vr9, vr16, vr4 vsub.w vr10, vr1, vr19 vsub.w vr11, vr2, vr20 LSX_SUMSUB_W vr13, vr14, vr19, vr20, vr5, vr6, vr8, vr9 LSX_SUMSUB_W vr15, vr16, vr17, vr18, vr21, vr22, vr10, vr11 vssrarni.h.w vr15, vr13, 6 vssrarni.h.w vr16, vr14, 6 vssrarni.h.w vr19, vr17, 6 vssrarni.h.w vr20, vr18, 6 fld.s f1, a0, 0 fld.s f2, a0, FDEC_STRIDE fld.s f3, a0, FDEC_STRIDE * 2 fld.s f4, a0, FDEC_STRIDE * 3 fld.s f5, a0, 4 fld.s f6, a0, FDEC_STRIDE + 4 fld.s f7, a0, FDEC_STRIDE * 2 + 4 fld.s f8, a0, 
FDEC_STRIDE * 3 + 4 vpickve2gr.w t0, vr2, 0 vinsgr2vr.w vr1, t0, 1 vpickve2gr.w t0, vr4, 0 vinsgr2vr.w vr3, t0, 1 vpickve2gr.w t0, vr6, 0 vinsgr2vr.w vr5, t0, 1 vpickve2gr.w t0, vr8, 0 vinsgr2vr.w vr7, t0, 1 vilvl.b vr1, vr0, vr1 vilvl.b vr5, vr0, vr5 vilvl.b vr3, vr0, vr3 vilvl.b vr7, vr0, vr7 vadd.h vr1, vr1, vr15 vadd.h vr5, vr5, vr16 vadd.h vr3, vr3, vr19 vadd.h vr7, vr7, vr20 vssrarni.bu.h vr3, vr1, 0 vssrarni.bu.h vr7, vr5, 0 vstelm.w vr3, a0, 0, 0 vstelm.w vr3, a0, FDEC_STRIDE, 1 vstelm.w vr3, a0, FDEC_STRIDE * 2, 2 vstelm.w vr3, a0, FDEC_STRIDE * 3, 3 vstelm.w vr7, a0, 4, 0 vstelm.w vr7, a0, FDEC_STRIDE + 4, 1 vstelm.w vr7, a0, FDEC_STRIDE * 2 + 4, 2 vstelm.w vr7, a0, FDEC_STRIDE * 3 + 4, 3 .endm /* * void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] ) * */ function_x264 add8x8_idct_lasx xvxor.v xr0, xr1, xr1 add8x4_idct_core_lasx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 64 add8x4_idct_core_lasx endfunc_x264 .macro add8x8_idct_core_lsx add8x4_idct_core_lsx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 64 add8x4_idct_core_lsx .endm function_x264 add8x8_idct_lsx vxor.v vr0, vr1, vr1 add8x8_idct_core_lsx endfunc_x264 /* * void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] ) */ function_x264 add16x16_idct_lasx move t4, a0 move t5, a1 xvxor.v xr0, xr1, xr1 add8x4_idct_core_lasx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 64 add8x4_idct_core_lasx addi.d a0, t4, 8 addi.d a1, t5, 128 add8x4_idct_core_lasx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 64 add8x4_idct_core_lasx addi.d t6, t4, FDEC_STRIDE * 8 move a0, t6 addi.d a1, t5, 256 add8x4_idct_core_lasx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 64 add8x4_idct_core_lasx addi.d a0, t6, 8 addi.d a1, t5, 384 add8x4_idct_core_lasx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 64 add8x4_idct_core_lasx endfunc_x264 function_x264 add16x16_idct_lsx move t4, a0 move t5, a1 vxor.v vr0, vr1, vr1 add8x8_idct_core_lsx addi.d a0, t4, 8 addi.d a1, t5, 128 add8x8_idct_core_lsx addi.d t6, t4, FDEC_STRIDE * 8 move a0, t6 addi.d a1, t5, 256 add8x8_idct_core_lsx addi.d a0, t6, 8 addi.d a1, t5, 384 add8x8_idct_core_lsx endfunc_x264 /* * void add8x8_idct8( pixel *dst, dctcoef dct[64] ) */ function_x264 add8x8_idct8_lasx xvxor.v xr20, xr1, xr1 // dct[0] += 32 ld.h t0, a1, 0 addi.w t0, t0, 32 st.h t0, a1, 0 vld vr0, a1, 0 vld vr2, a1, 32 vld vr4, a1, 64 vld vr6, a1, 96 vsrai.h vr8, vr2, 1 vsrai.h vr10, vr6, 1 vext2xv.w.h xr0, xr0 vext2xv.w.h xr2, xr2 vext2xv.w.h xr4, xr4 vext2xv.w.h xr6, xr6 vext2xv.w.h xr8, xr8 vext2xv.w.h xr10, xr10 LASX_SUMSUB_W xr11, xr12, xr0, xr4 xvsub.w xr13, xr8, xr6 xvadd.w xr14, xr10, xr2 LASX_SUMSUB_W xr15, xr18, xr11, xr14 LASX_SUMSUB_W xr16, xr17, xr12, xr13 vld vr0, a1, 16 vld vr2, a1, 48 vld vr4, a1, 80 vld vr6, a1, 112 vsrai.h vr1, vr0, 1 vsrai.h vr3, vr2, 1 vsrai.h vr5, vr4, 1 vsrai.h vr7, vr6, 1 vext2xv.w.h xr0, xr0 vext2xv.w.h xr2, xr2 vext2xv.w.h xr4, xr4 vext2xv.w.h xr6, xr6 vext2xv.w.h xr1, xr1 vext2xv.w.h xr3, xr3 vext2xv.w.h xr5, xr5 vext2xv.w.h xr7, xr7 LASX_SUMSUB_W xr9, xr10, xr4, xr2 LASX_SUMSUB_W xr11, xr12, xr6, xr0 xvsub.w xr10, xr10, xr6 xvsub.w xr10, xr10, xr7 xvsub.w xr11, xr11, xr2 xvsub.w xr11, xr11, xr3 xvadd.w xr12, xr12, xr4 xvadd.w xr12, xr12, xr5 xvadd.w xr9, xr9, xr0 xvadd.w xr9, xr9, xr1 xvsrai.w xr1, xr10, 2 xvsrai.w xr2, xr11, 2 xvsrai.w xr3, xr12, 2 xvsrai.w xr4, xr9, 2 xvadd.w xr5, xr4, xr10 xvadd.w xr6, xr3, xr11 xvsub.w xr7, xr2, xr12 xvsub.w xr8, xr9, xr1 LASX_SUMSUB_W xr1, xr14, xr15, xr8 LASX_SUMSUB_W xr2, xr13, xr16, xr7 LASX_SUMSUB_W xr3, xr12, xr17, xr6 LASX_SUMSUB_W xr4, 
xr11, xr18, xr5 LASX_TRANSPOSE8x8_W xr1, xr2, xr3, xr4, xr11, xr12, xr13, xr14, \ xr5, xr6, xr7, xr8, xr15, xr16, xr17, xr18, \ xr9, xr10, xr21, xr22 xvsrai.h xr9, xr7, 1 xvsrai.h xr10, xr17, 1 xvaddwev.w.h xr1, xr5, xr15 xvsubwev.w.h xr2, xr5, xr15 xvsubwev.w.h xr3, xr9, xr17 xvaddwev.w.h xr4, xr10, xr7 LASX_SUMSUB_W xr11, xr14, xr1, xr4 LASX_SUMSUB_W xr12, xr13, xr2, xr3 xvsrai.h xr1, xr6, 1 xvsrai.h xr2, xr8, 1 xvsrai.h xr3, xr16, 1 xvsrai.h xr4, xr18, 1 xvaddwev.w.h xr5, xr16, xr8 xvsubwev.w.h xr10, xr16, xr8 xvaddwev.w.h xr7, xr18, xr6 xvsubwev.w.h xr9, xr18, xr6 xvaddwev.w.h xr4, xr18, xr4 xvsub.w xr10, xr10, xr4 xvaddwev.w.h xr2, xr8, xr2 xvsub.w xr7, xr7, xr2 xvaddwev.w.h xr3, xr16, xr3 xvadd.w xr9, xr9, xr3 xvaddwev.w.h xr1, xr6, xr1 xvadd.w xr5, xr5, xr1 xvsrai.w xr1, xr10, 2 xvsrai.w xr2, xr7, 2 xvsrai.w xr3, xr9, 2 xvsrai.w xr4, xr5, 2 xvadd.w xr15, xr4, xr10 xvadd.w xr16, xr7, xr3 xvsub.w xr17, xr2, xr9 xvsub.w xr18, xr5, xr1 LASX_SUMSUB_W xr1, xr8, xr11, xr18 LASX_SUMSUB_W xr2, xr7, xr12, xr17 LASX_SUMSUB_W xr3, xr6, xr13, xr16 LASX_SUMSUB_W xr4, xr5, xr14, xr15 xvsrai.w xr11, xr1, 6 xvsrai.w xr12, xr2, 6 xvsrai.w xr13, xr3, 6 xvsrai.w xr14, xr4, 6 xvsrai.w xr15, xr5, 6 xvsrai.w xr16, xr6, 6 xvsrai.w xr17, xr7, 6 xvsrai.w xr18, xr8, 6 fld.d f1, a0, 0 fld.d f2, a0, FDEC_STRIDE fld.d f3, a0, FDEC_STRIDE * 2 fld.d f4, a0, FDEC_STRIDE * 3 fld.d f5, a0, FDEC_STRIDE * 4 fld.d f6, a0, FDEC_STRIDE * 5 fld.d f7, a0, FDEC_STRIDE * 6 fld.d f8, a0, FDEC_STRIDE * 7 vext2xv.wu.bu xr1, xr1 vext2xv.wu.bu xr2, xr2 vext2xv.wu.bu xr3, xr3 vext2xv.wu.bu xr4, xr4 vext2xv.wu.bu xr5, xr5 vext2xv.wu.bu xr6, xr6 vext2xv.wu.bu xr7, xr7 vext2xv.wu.bu xr8, xr8 xvadd.w xr1, xr1, xr11 xvadd.w xr2, xr2, xr12 xvadd.w xr3, xr3, xr13 xvadd.w xr4, xr4, xr14 xvadd.w xr5, xr5, xr15 xvadd.w xr6, xr6, xr16 xvadd.w xr7, xr7, xr17 xvadd.w xr8, xr8, xr18 xvssrarni.hu.w xr2, xr1, 0 xvssrarni.hu.w xr4, xr3, 0 xvssrarni.hu.w xr6, xr5, 0 xvssrarni.hu.w xr8, xr7, 0 xvpermi.d xr12, xr2, 0xd8 xvpermi.d xr14, xr4, 0xd8 xvpermi.d xr16, xr6, 0xd8 xvpermi.d xr18, xr8, 0xd8 xvssrlni.bu.h xr14, xr12, 0 xvssrlni.bu.h xr18, xr16, 0 xvstelm.d xr14, a0, 0, 0 xvstelm.d xr14, a0, FDEC_STRIDE, 2 xvstelm.d xr14, a0, FDEC_STRIDE * 2, 1 xvstelm.d xr14, a0, FDEC_STRIDE * 3, 3 xvstelm.d xr18, a0, FDEC_STRIDE * 4, 0 xvstelm.d xr18, a0, FDEC_STRIDE * 5, 2 xvstelm.d xr18, a0, FDEC_STRIDE * 6, 1 xvstelm.d xr18, a0, FDEC_STRIDE * 7, 3 endfunc_x264 function_x264 add8x8_idct8_lsx ld.h t0, a1, 0 addi.w t0, t0, 32 st.h t0, a1, 0 vld vr0, a1, 0 vld vr2, a1, 32 vld vr4, a1, 64 vld vr6, a1, 96 vsrai.h vr8, vr2, 1 vsrai.h vr10, vr6, 1 vexth.w.h vr1, vr0 vsllwil.w.h vr0, vr0, 0 vexth.w.h vr3, vr2 vsllwil.w.h vr2, vr2, 0 vexth.w.h vr5, vr4 vsllwil.w.h vr4, vr4, 0 vexth.w.h vr7, vr6 vsllwil.w.h vr6, vr6, 0 vexth.w.h vr9, vr8 vsllwil.w.h vr8, vr8, 0 vexth.w.h vr11, vr10 vsllwil.w.h vr10, vr10, 0 LSX_SUMSUB_W vr12, vr13, vr14, vr15, vr0, vr1, vr4, vr5 vsub.w vr16, vr8, vr6 vsub.w vr17, vr9, vr7 vadd.w vr18, vr10, vr2 vadd.w vr19, vr11, vr3 LSX_SUMSUB_W vr20, vr21, vr18, vr19, vr12, vr13, vr18, vr19 LSX_SUMSUB_W vr22, vr23, vr16, vr17, vr14, vr15, vr16, vr17 vld vr0, a1, 16 vld vr2, a1, 48 vld vr4, a1, 80 vld vr6, a1, 112 vsrai.h vr1, vr0, 1 vsrai.h vr3, vr2, 1 vsrai.h vr5, vr4, 1 vsrai.h vr7, vr6, 1 vexth.w.h vr8, vr0 vsllwil.w.h vr0, vr0, 0 vexth.w.h vr10, vr2 vsllwil.w.h vr2, vr2, 0 vexth.w.h vr12, vr4 vsllwil.w.h vr4, vr4, 0 vexth.w.h vr14, vr6 vsllwil.w.h vr6, vr6, 0 vexth.w.h vr9, vr1 vsllwil.w.h vr1, vr1, 0 vexth.w.h vr11, vr3 vsllwil.w.h vr3, vr3, 0 
vexth.w.h vr13, vr5 vsllwil.w.h vr5, vr5, 0 vexth.w.h vr15, vr7 vsllwil.w.h vr7, vr7, 0 addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 LSX_SUMSUB_W vr24, vr25, vr26, vr27, vr4, vr12, vr2, vr10 LSX_SUMSUB_W vr28, vr29, vr30, vr31, vr6, vr14, vr0, vr8 vsub.w vr26, vr26, vr6 vsub.w vr27, vr27, vr14 vsub.w vr26, vr26, vr7 vsub.w vr27, vr27, vr15 vsub.w vr28, vr28, vr2 vsub.w vr29, vr29, vr10 vsub.w vr28, vr28, vr3 vsub.w vr29, vr29, vr11 vadd.w vr30, vr30, vr4 vadd.w vr31, vr31, vr12 vadd.w vr30, vr30, vr5 vadd.w vr31, vr31, vr13 vadd.w vr24, vr24, vr0 vadd.w vr25, vr25, vr8 vadd.w vr24, vr24, vr1 vadd.w vr25, vr25, vr9 vsrai.w vr1, vr26, 2 vsrai.w vr9, vr27, 2 vsrai.w vr2, vr28, 2 vsrai.w vr10, vr29, 2 vsrai.w vr3, vr30, 2 vsrai.w vr11, vr31, 2 vsrai.w vr4, vr24, 2 vsrai.w vr12, vr25, 2 vadd.w vr5, vr4, vr26 vadd.w vr13, vr12, vr27 vadd.w vr6, vr3, vr28 vadd.w vr14, vr11, vr29 vsub.w vr7, vr2, vr30 vsub.w vr15, vr10, vr31 vsub.w vr0, vr24, vr1 vsub.w vr8, vr25, vr9 LSX_SUMSUB_W vr1, vr9, vr30, vr31, vr20, vr21, vr0, vr8 LSX_SUMSUB_W vr2, vr10, vr28, vr29, vr22, vr23, vr7, vr15 LSX_SUMSUB_W vr3, vr11, vr26, vr27, vr16, vr17, vr6, vr14 LSX_SUMSUB_W vr4, vr12, vr24, vr25, vr18, vr19, vr5, vr13 LSX_TRANSPOSE4x4_W vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr20, vr22 LSX_TRANSPOSE4x4_W vr9, vr10, vr11, vr12, vr20, vr22, vr16, vr18, vr1, vr2 LSX_TRANSPOSE4x4_W vr24, vr26, vr28, vr30, vr13, vr14, vr15, vr8, vr21, vr23 LSX_TRANSPOSE4x4_W vr25, vr27, vr29, vr31, vr21, vr23, vr17, vr19, vr24, vr26 vsrai.h vr3, vr7, 1 vsrai.h vr11, vr15, 1 vsrai.h vr4, vr16, 1 vsrai.h vr12, vr17, 1 vaddwev.w.h vr1, vr5, vr20 vaddwev.w.h vr9, vr13, vr21 vsubwev.w.h vr2, vr5, vr20 vsubwev.w.h vr10, vr13, vr21 vsubwev.w.h vr3, vr3, vr16 vsubwev.w.h vr11, vr11, vr17 vaddwev.w.h vr4, vr4, vr7 vaddwev.w.h vr12, vr12, vr15 LSX_SUMSUB_W vr24, vr25, vr30, vr31, vr1, vr9, vr4, vr12 LSX_SUMSUB_W vr26, vr27, vr28, vr29, vr2, vr10, vr3, vr11 vsrai.h vr1, vr6, 1 vsrai.h vr9, vr14, 1 vsrai.h vr2, vr0, 1 vsrai.h vr10, vr8, 1 vsrai.h vr3, vr22, 1 vsrai.h vr11, vr23, 1 vsrai.h vr4, vr18, 1 vsrai.h vr12, vr19, 1 vaddwev.w.h vr5, vr22, vr0 vaddwev.w.h vr13, vr23, vr8 vsubwev.w.h vr20, vr22, vr0 vsubwev.w.h vr21, vr23, vr8 vaddwev.w.h vr7, vr18, vr6 vaddwev.w.h vr15, vr19, vr14 vsubwev.w.h vr16, vr18, vr6 vsubwev.w.h vr17, vr19, vr14 vaddwev.w.h vr4, vr18, vr4 vaddwev.w.h vr12, vr19, vr12 vsub.w vr20, vr20, vr4 vsub.w vr21, vr21, vr12 vaddwev.w.h vr2, vr0, vr2 vaddwev.w.h vr10, vr8, vr10 vsub.w vr7, vr7, vr2 vsub.w vr15, vr15, vr10 vaddwev.w.h vr3, vr22, vr3 vaddwev.w.h vr11, vr23, vr11 vadd.w vr16, vr16, vr3 vadd.w vr17, vr17, vr11 vaddwev.w.h vr1, vr6, vr1 vaddwev.w.h vr9, vr14, vr9 vadd.w vr5, vr5, vr1 vadd.w vr13, vr13, vr9 vsrai.w vr1, vr20, 2 vsrai.w vr9, vr21, 2 vsrai.w vr2, vr7, 2 vsrai.w vr10, vr15, 2 vsrai.w vr3, vr16, 2 vsrai.w vr11, vr17, 2 vsrai.w vr4, vr5, 2 vsrai.w vr12, vr13, 2 vadd.w vr20, vr4, vr20 vadd.w vr21, vr12, vr21 vadd.w vr22, vr7, vr3 vadd.w vr23, vr15, vr11 vsub.w vr16, vr2, vr16 vsub.w vr17, vr10, vr17 vsub.w vr18, vr5, vr1 vsub.w vr19, vr13, vr9 LSX_SUMSUB_W vr1, vr9, vr0, vr8, vr24, vr25, vr18, vr19 LSX_SUMSUB_W vr2, vr10, vr7, vr15, vr26, vr27, vr16, vr17 LSX_SUMSUB_W vr3, vr11, vr6, vr14, vr28, vr29, vr22, vr23 LSX_SUMSUB_W vr4, vr12, vr5, vr13, vr30, vr31, vr20, vr21 vsrai.w vr24, vr1, 6 vsrai.w vr25, vr9, 6 vsrai.w vr26, vr2, 6 vsrai.w vr27, vr10, 6 vsrai.w vr28, vr3, 6 vsrai.w vr29, vr11, 6 
vsrai.w vr30, vr4, 6 vsrai.w vr31, vr12, 6 vsrai.w vr20, vr5, 6 vsrai.w vr21, vr13, 6 vsrai.w vr22, vr6, 6 vsrai.w vr23, vr14, 6 vsrai.w vr16, vr7, 6 vsrai.w vr17, vr15, 6 vsrai.w vr18, vr0, 6 vsrai.w vr19, vr8, 6 fld.d f1, a0, 0 fld.d f2, a0, FDEC_STRIDE fld.d f3, a0, FDEC_STRIDE * 2 fld.d f4, a0, FDEC_STRIDE * 3 fld.d f5, a0, FDEC_STRIDE * 4 fld.d f6, a0, FDEC_STRIDE * 5 fld.d f7, a0, FDEC_STRIDE * 6 fld.d f8, a0, FDEC_STRIDE * 7 vsllwil.hu.bu vr1, vr1, 0 vexth.wu.hu vr9, vr1 vsllwil.wu.hu vr1, vr1, 0 vsllwil.hu.bu vr2, vr2, 0 vexth.wu.hu vr10, vr2 vsllwil.wu.hu vr2, vr2, 0 vsllwil.hu.bu vr3, vr3, 0 vexth.wu.hu vr11, vr3 vsllwil.wu.hu vr3, vr3, 0 vsllwil.hu.bu vr4, vr4, 0 vexth.wu.hu vr12, vr4 vsllwil.wu.hu vr4, vr4, 0 vsllwil.hu.bu vr5, vr5, 0 vexth.wu.hu vr13, vr5 vsllwil.wu.hu vr5, vr5, 0 vsllwil.hu.bu vr6, vr6, 0 vexth.wu.hu vr14, vr6 vsllwil.wu.hu vr6, vr6, 0 vsllwil.hu.bu vr7, vr7, 0 vexth.wu.hu vr15, vr7 vsllwil.wu.hu vr7, vr7, 0 vsllwil.hu.bu vr8, vr8, 0 vexth.wu.hu vr0, vr8 vsllwil.wu.hu vr8, vr8, 0 vadd.w vr1, vr1, vr24 vadd.w vr9, vr9, vr25 vadd.w vr2, vr2, vr26 vadd.w vr10, vr10, vr27 vadd.w vr3, vr3, vr28 vadd.w vr11, vr11, vr29 vadd.w vr4, vr4, vr30 vadd.w vr12, vr12, vr31 vadd.w vr5, vr5, vr20 vadd.w vr13, vr13, vr21 vadd.w vr6, vr6, vr22 vadd.w vr14, vr14, vr23 vadd.w vr7, vr7, vr16 vadd.w vr15, vr15, vr17 vadd.w vr8, vr8, vr18 vadd.w vr0, vr0, vr19 vssrarni.hu.w vr2, vr1, 0 vssrarni.hu.w vr10, vr9, 0 vssrarni.hu.w vr4, vr3, 0 vssrarni.hu.w vr12, vr11, 0 vssrarni.hu.w vr6, vr5, 0 vssrarni.hu.w vr14, vr13, 0 vssrarni.hu.w vr8, vr7, 0 vssrarni.hu.w vr0, vr15, 0 vpermi.w vr20, vr10, 0x0E vpermi.w vr10, vr2, 0x44 vpermi.w vr20, vr2, 0x4E vpermi.w vr21, vr12, 0x0E vpermi.w vr12, vr4, 0x44 vpermi.w vr21, vr4, 0x4E vpermi.w vr22, vr14, 0x0E vpermi.w vr14, vr6, 0x44 vpermi.w vr22, vr6, 0x4E vpermi.w vr23, vr0, 0x0E vpermi.w vr0, vr8, 0x44 vpermi.w vr23, vr8, 0x4E vssrlni.bu.h vr12, vr10, 0 vssrlni.bu.h vr21, vr20, 0 vssrlni.bu.h vr0, vr14, 0 vssrlni.bu.h vr23, vr22, 0 vstelm.d vr12, a0, 0, 0 vstelm.d vr21, a0, FDEC_STRIDE, 0 vstelm.d vr12, a0, FDEC_STRIDE * 2, 1 vstelm.d vr21, a0, FDEC_STRIDE * 3, 1 vstelm.d vr0, a0, FDEC_STRIDE * 4, 0 vstelm.d vr23, a0, FDEC_STRIDE * 5, 0 vstelm.d vr0, a0, FDEC_STRIDE * 6, 1 vstelm.d vr23, a0, FDEC_STRIDE * 7, 1 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 .macro add8x4_idct_dc_lasx xvldrepl.h xr11, a1, 0 xvldrepl.h xr12, a1, 2 xvilvl.d xr12, xr12, xr11 xvsrari.h xr12, xr12, 6 fld.d f0, a0, 0 fld.d f1, a0, FDEC_STRIDE fld.d f2, a0, FDEC_STRIDE * 2 fld.d f3, a0, FDEC_STRIDE * 3 xvinsve0.d xr0, xr1, 1 xvinsve0.d xr2, xr3, 1 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr2, xr2 xvadd.h xr0, xr0, xr12 xvadd.h xr2, xr2, xr12 xvssrarni.bu.h xr2, xr0, 0 xvstelm.d xr2, a0, 0, 0 xvstelm.d xr2, a0, FDEC_STRIDE, 2 xvstelm.d xr2, a0, FDEC_STRIDE * 2, 1 xvstelm.d xr2, a0, FDEC_STRIDE * 3, 3 .endm .macro add8x4_idct_dc_lsx vldrepl.h vr11, a1, 0 vldrepl.h vr12, a1, 2 vilvl.d vr12, vr12, vr11 vsrari.h vr12, vr12, 6 fld.d f0, a0, 0 fld.d f1, a0, FDEC_STRIDE fld.d f2, a0, FDEC_STRIDE * 2 fld.d f3, a0, FDEC_STRIDE * 3 vsllwil.hu.bu vr0, vr0, 0 vsllwil.hu.bu vr1, vr1, 0 vsllwil.hu.bu vr2, vr2, 0 vsllwil.hu.bu vr3, vr3, 0 vadd.h vr0, vr0, vr12 vadd.h vr1, vr1, vr12 vadd.h vr2, vr2, vr12 vadd.h vr3, vr3, vr12 vssrarni.bu.h vr2, vr0, 0 vssrarni.bu.h vr3, vr1, 0 vstelm.d vr2, a0, 0, 0 vstelm.d vr3, a0, FDEC_STRIDE, 0 vstelm.d vr2, a0, FDEC_STRIDE * 2, 
1 vstelm.d vr3, a0, FDEC_STRIDE * 3, 1 .endm /* * void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] ) */ function_x264 add8x8_idct_dc_lasx add8x4_idct_dc_lasx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 4 add8x4_idct_dc_lasx endfunc_x264 function_x264 add8x8_idct_dc_lsx add8x4_idct_dc_lsx addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 4 add8x4_idct_dc_lsx endfunc_x264 .macro add_16x16_idct_dc_core_lasx a0, a1 vldrepl.h vr11, \a1, 0 vldrepl.h vr12, \a1, 2 vldrepl.h vr13, \a1, 4 vldrepl.h vr14, \a1, 6 xvinsve0.d xr11, xr12, 1 xvinsve0.d xr11, xr13, 2 xvinsve0.d xr11, xr14, 3 xvsrari.h xr11, xr11, 6 vld vr0, \a0, 0 vld vr1, \a0, FDEC_STRIDE vld vr2, \a0, FDEC_STRIDE * 2 vld vr3, \a0, FDEC_STRIDE * 3 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 xvadd.h xr0, xr0, xr11 xvadd.h xr1, xr1, xr11 xvadd.h xr2, xr2, xr11 xvadd.h xr3, xr3, xr11 xvssrarni.bu.h xr1, xr0, 0 xvssrarni.bu.h xr3, xr2, 0 xvpermi.d xr4, xr1, 0xD8 xvpermi.d xr5, xr1, 0x8D xvpermi.d xr6, xr3, 0xD8 xvpermi.d xr7, xr3, 0x8D vst vr4, \a0, 0 vst vr5, \a0, FDEC_STRIDE vst vr6, \a0, FDEC_STRIDE * 2 vst vr7, \a0, FDEC_STRIDE * 3 .endm /* * void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] ) */ function_x264 add16x16_idct_dc_lasx add_16x16_idct_dc_core_lasx a0, a1 addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 8 add_16x16_idct_dc_core_lasx a0, a1 addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 8 add_16x16_idct_dc_core_lasx a0, a1 addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 8 add_16x16_idct_dc_core_lasx a0, a1 endfunc_x264 .macro add_16x16_idct_dc_core_lsx a0, a1 vldrepl.h vr11, \a1, 0 vldrepl.h vr12, \a1, 2 vldrepl.h vr13, \a1, 4 vldrepl.h vr14, \a1, 6 vpermi.w vr12, vr11, 0x44 vpermi.w vr14, vr13, 0x44 vsrari.h vr12, vr12, 6 vsrari.h vr14, vr14, 6 vld vr0, \a0, 0 vld vr1, \a0, FDEC_STRIDE vld vr2, \a0, FDEC_STRIDE * 2 vld vr3, \a0, FDEC_STRIDE * 3 vexth.hu.bu vr5, vr0 vsllwil.hu.bu vr0, vr0, 0 vexth.hu.bu vr6, vr1 vsllwil.hu.bu vr1, vr1, 0 vexth.hu.bu vr7, vr2 vsllwil.hu.bu vr2, vr2, 0 vexth.hu.bu vr8, vr3 vsllwil.hu.bu vr3, vr3, 0 vadd.h vr0, vr0, vr12 vadd.h vr5, vr5, vr14 vadd.h vr1, vr1, vr12 vadd.h vr6, vr6, vr14 vadd.h vr2, vr2, vr12 vadd.h vr7, vr7, vr14 vadd.h vr3, vr3, vr12 vadd.h vr8, vr8, vr14 vssrarni.bu.h vr1, vr0, 0 vssrarni.bu.h vr6, vr5, 0 vssrarni.bu.h vr3, vr2, 0 vssrarni.bu.h vr8, vr7, 0 vpermi.w vr9, vr6, 0x0E vpermi.w vr6, vr1, 0x44 vpermi.w vr9, vr1, 0x4E vpermi.w vr10, vr8, 0x0E vpermi.w vr8, vr3, 0x44 vpermi.w vr10, vr3, 0x4E vst vr6, \a0, 0 vst vr9, \a0, FDEC_STRIDE vst vr8, \a0, FDEC_STRIDE * 2 vst vr10, \a0, FDEC_STRIDE * 3 .endm function_x264 add16x16_idct_dc_lsx add_16x16_idct_dc_core_lsx a0, a1 addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 8 add_16x16_idct_dc_core_lsx a0, a1 addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 8 add_16x16_idct_dc_core_lsx a0, a1 addi.d a0, a0, FDEC_STRIDE * 4 addi.d a1, a1, 8 add_16x16_idct_dc_core_lsx a0, a1 endfunc_x264 /* * void idct4x4dc( dctcoef d[16] ) */ function_x264 idct4x4dc_lasx la.local t0, last64_shuf xvld xr0, a0, 0 xvld xr20, t0, 0 xvshuf4i.b xr1, xr0, 0x4E xvhaddw.w.h xr2, xr0, xr0 xvhsubw.w.h xr3, xr1, xr1 xvshuf4i.h xr2, xr2, 0x4E xvshuf4i.h xr3, xr3, 0x4E xvhaddw.d.w xr4, xr2, xr2 xvhsubw.d.w xr5, xr2, xr2 xvhsubw.d.w xr6, xr3, xr3 xvhaddw.d.w xr7, xr3, xr3 xvpickev.w xr8, xr5, xr4 xvpickev.w xr9, xr7, xr6 xvpickev.h xr10, xr9, xr8 xvperm.w xr10, xr10, xr20 xvshuf4i.b xr11, xr10, 0x4E xvhaddw.w.h xr12, xr10, xr10 xvhsubw.w.h xr13, xr11, xr11 xvshuf4i.h xr12, xr12, 0x4E xvshuf4i.h xr13, xr13, 0x4E xvhaddw.d.w 
xr14, xr12, xr12 xvhsubw.d.w xr15, xr12, xr12 xvhsubw.d.w xr16, xr13, xr13 xvhaddw.d.w xr17, xr13, xr13 xvpackev.w xr18, xr15, xr14 xvpackev.w xr19, xr17, xr16 xvilvl.d xr0, xr19, xr18 xvilvh.d xr1, xr19, xr18 xvpickev.h xr2, xr1, xr0 xvst xr2, a0, 0 endfunc_x264 function_x264 idct4x4dc_lsx vld vr0, a0, 0 vld vr20, a0, 16 vshuf4i.b vr1, vr0, 0x4E vshuf4i.b vr11, vr20, 0x4E vhaddw.w.h vr2, vr0, vr0 vhaddw.w.h vr12, vr20, vr20 vhsubw.w.h vr3, vr1, vr1 vhsubw.w.h vr13, vr11, vr11 vshuf4i.h vr2, vr2, 0x4E vshuf4i.h vr12, vr12, 0x4E vshuf4i.h vr3, vr3, 0x4E vshuf4i.h vr13, vr13, 0x4E vhaddw.d.w vr4, vr2, vr2 vhaddw.d.w vr14, vr12, vr12 vhsubw.d.w vr5, vr2, vr2 vhsubw.d.w vr15, vr12, vr12 vhsubw.d.w vr6, vr3, vr3 vhsubw.d.w vr16, vr13, vr13 vhaddw.d.w vr7, vr3, vr3 vhaddw.d.w vr17, vr13, vr13 vpickev.w vr8, vr5, vr4 vpickev.w vr18, vr15, vr14 vpickev.w vr9, vr7, vr6 vpickev.w vr19, vr17, vr16 vpickev.h vr10, vr9, vr8 vpickev.h vr21, vr19, vr18 vpermi.w vr22, vr21, 0x0E vpermi.w vr21, vr10, 0x44 vpermi.w vr22, vr10, 0x4E vpermi.w vr21, vr21, 0xD8 vpermi.w vr22, vr22, 0xD8 vshuf4i.b vr11, vr21, 0x4E vshuf4i.b vr12, vr22, 0x4E vhaddw.w.h vr21, vr21, vr21 vhaddw.w.h vr22, vr22, vr22 vhsubw.w.h vr11, vr11, vr11 vhsubw.w.h vr12, vr12, vr12 vshuf4i.h vr21, vr21, 0x4E vshuf4i.h vr22, vr22, 0x4E vshuf4i.h vr11, vr11, 0x4E vshuf4i.h vr12, vr12, 0x4E vhaddw.d.w vr13, vr21, vr21 vhaddw.d.w vr14, vr22, vr22 vhsubw.d.w vr15, vr21, vr21 vhsubw.d.w vr16, vr22, vr22 vhsubw.d.w vr17, vr11, vr11 vhsubw.d.w vr18, vr12, vr12 vhaddw.d.w vr19, vr11, vr11 vhaddw.d.w vr20, vr12, vr12 vpackev.w vr7, vr15, vr13 vpackev.w vr8, vr16, vr14 vpackev.w vr9, vr19, vr17 vpackev.w vr10, vr20, vr18 vilvl.d vr0, vr9, vr7 vilvl.d vr4, vr10, vr8 vilvh.d vr1, vr9, vr7 vilvh.d vr5, vr10, vr8 vpickev.h vr2, vr1, vr0 vpickev.h vr3, vr5, vr4 vst vr2, a0, 0 vst vr3, a0, 16 endfunc_x264 /* * void dct4x4dc( dctcoef d[16] ) */ function_x264 dct4x4dc_lasx la.local t0, last64_shuf xvld xr0, a0, 0 xvld xr20, t0, 0 xvshuf4i.b xr1, xr0, 0x4E xvhaddw.w.h xr2, xr0, xr0 xvhsubw.w.h xr3, xr1, xr1 xvshuf4i.h xr2, xr2, 0x4E xvshuf4i.h xr3, xr3, 0x4E xvhaddw.d.w xr4, xr2, xr2 xvhsubw.d.w xr5, xr2, xr2 xvhsubw.d.w xr6, xr3, xr3 xvhaddw.d.w xr7, xr3, xr3 xvpickev.w xr8, xr5, xr4 xvpickev.w xr9, xr7, xr6 xvpickev.h xr10, xr9, xr8 xvperm.w xr10, xr10, xr20 xvshuf4i.b xr11, xr10, 0x4E xvhaddw.w.h xr12, xr10, xr10 xvhsubw.w.h xr13, xr11, xr11 xvshuf4i.h xr12, xr12, 0x4E xvshuf4i.h xr13, xr13, 0x4E xvhaddw.d.w xr14, xr12, xr12 xvhsubw.d.w xr15, xr12, xr12 xvhsubw.d.w xr16, xr13, xr13 xvhaddw.d.w xr17, xr13, xr13 xvpackev.w xr18, xr15, xr14 xvpackev.w xr19, xr17, xr16 xvsrari.w xr18, xr18, 1 xvsrari.w xr19, xr19, 1 xvilvl.d xr0, xr19, xr18 xvilvh.d xr1, xr19, xr18 xvpickev.h xr2, xr1, xr0 xvst xr2, a0, 0 endfunc_x264 function_x264 dct4x4dc_lsx vld vr0, a0, 0 vld vr20, a0, 16 vshuf4i.b vr1, vr0, 0x4E vshuf4i.b vr11, vr20, 0x4E vhaddw.w.h vr2, vr0, vr0 vhaddw.w.h vr12, vr20, vr20 vhsubw.w.h vr3, vr1, vr1 vhsubw.w.h vr13, vr11, vr11 vshuf4i.h vr2, vr2, 0x4E vshuf4i.h vr12, vr12, 0x4E vshuf4i.h vr3, vr3, 0x4E vshuf4i.h vr13, vr13, 0x4E vhaddw.d.w vr4, vr2, vr2 vhaddw.d.w vr14, vr12, vr12 vhsubw.d.w vr5, vr2, vr2 vhsubw.d.w vr15, vr12, vr12 vhsubw.d.w vr6, vr3, vr3 vhsubw.d.w vr16, vr13, vr13 vhaddw.d.w vr7, vr3, vr3 vhaddw.d.w vr17, vr13, vr13 vpickev.w vr8, vr5, vr4 vpickev.w vr18, vr15, vr14 vpickev.w vr9, vr7, vr6 vpickev.w vr19, vr17, vr16 vpickev.h vr10, vr9, vr8 vpickev.h vr21, vr19, vr18 vpermi.w vr22, vr21, 0x0E vpermi.w vr21, vr10, 0x44 vpermi.w vr22, vr10, 
0x4E vpermi.w vr21, vr21, 0xD8 vpermi.w vr22, vr22, 0xD8 vshuf4i.b vr11, vr21, 0x4E vshuf4i.b vr12, vr22, 0x4E vhaddw.w.h vr21, vr21, vr21 vhaddw.w.h vr22, vr22, vr22 vhsubw.w.h vr11, vr11, vr11 vhsubw.w.h vr12, vr12, vr12 vshuf4i.h vr21, vr21, 0x4E vshuf4i.h vr22, vr22, 0x4E vshuf4i.h vr11, vr11, 0x4E vshuf4i.h vr12, vr12, 0x4E vhaddw.d.w vr13, vr21, vr21 vhaddw.d.w vr14, vr22, vr22 vhsubw.d.w vr15, vr21, vr21 vhsubw.d.w vr16, vr22, vr22 vhsubw.d.w vr17, vr11, vr11 vhsubw.d.w vr18, vr12, vr12 vhaddw.d.w vr19, vr11, vr11 vhaddw.d.w vr20, vr12, vr12 vpackev.w vr7, vr15, vr13 vpackev.w vr8, vr16, vr14 vpackev.w vr9, vr19, vr17 vpackev.w vr10, vr20, vr18 vsrari.w vr7, vr7, 1 vsrari.w vr8, vr8, 1 vsrari.w vr9, vr9, 1 vsrari.w vr10, vr10, 1 vilvl.d vr0, vr9, vr7 vilvl.d vr4, vr10, vr8 vilvh.d vr1, vr9, vr7 vilvh.d vr10, vr10, vr8 vpickev.h vr2, vr1, vr0 vpickev.h vr3, vr10, vr4 vst vr2, a0, 0 vst vr3, a0, 16 endfunc_x264 .macro LSX_LOAD_PIX_2 data1, data2 vld vr0, a1, 0 vld vr1, a1, FENC_STRIDE vld vr2, a2, 0 vld vr3, a2, FDEC_STRIDE vilvl.b vr0, vr8, vr0 vilvl.b vr1, vr8, vr1 vilvl.b vr2, vr8, vr2 vilvl.b vr3, vr8, vr3 vsub.h \data1, vr0, vr2 vsub.h \data2, vr1, vr3 addi.d a1, a1, FENC_STRIDE * 2 addi.d a2, a2, FDEC_STRIDE * 2 .endm .macro LSX_DCT8_1D LSX_SUMSUB_H vr0, vr8, vr12, vr19 LSX_SUMSUB_H vr1, vr9, vr13, vr18 LSX_SUMSUB_H vr2, vr10, vr14, vr17 LSX_SUMSUB_H vr3, vr11, vr15, vr16 LSX_SUMSUB_H vr4, vr6, vr0, vr3 LSX_SUMSUB_H vr5, vr7, vr1, vr2 vsrai.h vr20, vr8, 1 vadd.h vr20, vr20, vr9 vadd.h vr20, vr20, vr10 vadd.h vr0, vr20, vr8 vsrai.h vr20, vr10, 1 vsub.h vr21, vr8, vr11 vsub.h vr21, vr21, vr10 vsub.h vr1, vr21, vr20 vsrai.h vr20, vr9, 1 vadd.h vr21, vr8, vr11 vsub.h vr21, vr21, vr9 vsub.h vr2, vr21, vr20 vsrai.h vr20, vr11, 1 vsub.h vr21, vr9, vr10 vadd.h vr21, vr21, vr11 vadd.h vr3, vr21, vr20 vadd.h vr12, vr4, vr5 vsrai.h vr20, vr3, 2 vadd.h vr13, vr0, vr20 vsrai.h vr20, vr7, 1 vadd.h vr14, vr6, vr20 vsrai.h vr20, vr2, 2 vadd.h vr15, vr1, vr20 vsub.h vr16, vr4, vr5 vsrai.h vr20, vr1, 2 vsub.h vr17, vr2, vr20 vsrai.h vr20, vr6, 1 vsub.h vr18, vr20, vr7 vsrai.h vr20, vr0, 2 vsub.h vr19, vr20, vr3 .endm /* * void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 ) */ function_x264 sub8x8_dct8_lsx vxor.v vr8, vr0, vr0 // vr12 ... 
vr19 LSX_LOAD_PIX_2 vr12, vr13 LSX_LOAD_PIX_2 vr14, vr15 LSX_LOAD_PIX_2 vr16, vr17 LSX_LOAD_PIX_2 vr18, vr19 LSX_DCT8_1D LSX_TRANSPOSE8x8_H vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_DCT8_1D vst vr12, a0, 0 vst vr13, a0, 16 vst vr14, a0, 32 vst vr15, a0, 48 vst vr16, a0, 64 vst vr17, a0, 80 vst vr18, a0, 96 vst vr19, a0, 112 endfunc_x264 .macro LASX_LOAD_PIX_2 data1, data2 xvld xr0, a1, 0 xvld xr1, a1, FENC_STRIDE xvld xr2, a2, 0 xvld xr3, a2, FDEC_STRIDE xvpermi.d xr0, xr0, 0x50 xvpermi.d xr1, xr1, 0x50 xvpermi.d xr2, xr2, 0x50 xvpermi.d xr3, xr3, 0x50 xvxor.v xr4, xr0, xr0 xvilvl.b xr0, xr4, xr0 xvilvl.b xr1, xr4, xr1 xvilvl.b xr2, xr4, xr2 xvilvl.b xr3, xr4, xr3 xvsub.h \data1, xr0, xr2 xvsub.h \data2, xr1, xr3 addi.d a1, a1, FENC_STRIDE * 2 addi.d a2, a2, FDEC_STRIDE * 2 .endm .macro LASX_SUMSUB_H sum, diff, a, b xvadd.h \sum, \a, \b xvsub.h \diff, \a, \b .endm .macro LASX_DCT8_1D LASX_SUMSUB_H xr0, xr8, xr12, xr19 LASX_SUMSUB_H xr1, xr9, xr13, xr18 LASX_SUMSUB_H xr2, xr10, xr14, xr17 LASX_SUMSUB_H xr3, xr11, xr15, xr16 LASX_SUMSUB_H xr4, xr6, xr0, xr3 LASX_SUMSUB_H xr5, xr7, xr1, xr2 xvsrai.h xr20, xr8, 1 xvadd.h xr20, xr20, xr9 xvadd.h xr20, xr20, xr10 xvadd.h xr0, xr20, xr8 xvsrai.h xr20, xr10, 1 xvsub.h xr21, xr8, xr11 xvsub.h xr21, xr21, xr10 xvsub.h xr1, xr21, xr20 xvsrai.h xr20, xr9, 1 xvadd.h xr21, xr8, xr11 xvsub.h xr21, xr21, xr9 xvsub.h xr2, xr21, xr20 xvsrai.h xr20, xr11, 1 xvsub.h xr21, xr9, xr10 xvadd.h xr21, xr21, xr11 xvadd.h xr3, xr21, xr20 xvadd.h xr12, xr4, xr5 xvsrai.h xr20, xr3, 2 xvadd.h xr13, xr0, xr20 xvsrai.h xr20, xr7, 1 xvadd.h xr14, xr6, xr20 xvsrai.h xr20, xr2, 2 xvadd.h xr15, xr1, xr20 xvsub.h xr16, xr4, xr5 xvsrai.h xr20, xr1, 2 xvsub.h xr17, xr2, xr20 xvsrai.h xr20, xr6, 1 xvsub.h xr18, xr20, xr7 xvsrai.h xr20, xr0, 2 xvsub.h xr19, xr20, xr3 .endm .macro SUB16x8_DCT8_LASX LASX_LOAD_PIX_2 xr12, xr13 LASX_LOAD_PIX_2 xr14, xr15 LASX_LOAD_PIX_2 xr16, xr17 LASX_LOAD_PIX_2 xr18, xr19 LASX_DCT8_1D LASX_TRANSPOSE8x8_H xr12, xr13, xr14, xr15, xr16, xr17, xr18, xr19, \ xr12, xr13, xr14, xr15, xr16, xr17, xr18, xr19, \ xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7 LASX_DCT8_1D xmov xr0, xr13 xvpermi.q xr13, xr12, 0x20 xvst xr13, a0, 0 xmov xr1, xr15 xvpermi.q xr15, xr14, 0x20 xvst xr15, a0, 32 xmov xr2, xr17 xvpermi.q xr17, xr16, 0x20 xvst xr17, a0, 64 xmov xr3, xr19 xvpermi.q xr19, xr18, 0x20 xvst xr19, a0, 96 xvpermi.q xr12, xr0, 0x13 xvpermi.q xr14, xr1, 0x13 xvpermi.q xr16, xr2, 0x13 xvpermi.q xr18, xr3, 0x13 xvst xr12, a0, 128 xvst xr14, a0, 160 xvst xr16, a0, 192 xvst xr18, a0, 224 .endm /* * void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ) */ function_x264 sub16x16_dct8_lasx move t1, a1 move t3, a2 SUB16x8_DCT8_LASX addi.d a0, a0, 256 addi.d a1, t1, FENC_STRIDE * 8 addi.d a2, t3, FDEC_STRIDE * 8 SUB16x8_DCT8_LASX endfunc_x264 .macro LSX_LOAD_PIX_22 data1, data2, data3, data4 vld vr0, a1, 0 vld vr4, a1, 16 vld vr1, a1, FENC_STRIDE vld vr5, a1, FENC_STRIDE + 16 vld vr2, a2, 0 vld vr6, a2, 16 vld vr3, a2, FDEC_STRIDE vld vr7, a2, FDEC_STRIDE + 16 vpermi.w vr8, vr0, 0x0E vpermi.w vr0, vr0, 0x44 vpermi.w vr8, vr8, 0x44 vpermi.w vr9, vr1, 0x0E vpermi.w vr1, vr1, 0x44 vpermi.w vr9, vr9, 0x44 vpermi.w vr10, vr2, 0x0E vpermi.w vr2, vr2, 0x44 vpermi.w vr10, vr10, 0x44 vpermi.w vr11, vr3, 0x0E vpermi.w vr3, vr3, 0x44 vpermi.w vr11, vr11, 0x44 vxor.v vr30, vr0, vr0 vxor.v vr31, vr8, vr8 vilvl.b vr0, vr30, vr0 vilvl.b vr8, vr31, vr8 vilvl.b vr1, vr30, vr1 
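// vr30/vr31 hold zero, so these vilvl.b interleaves zero-extend each source
// byte to a 16-bit lane; the vsub.h instructions below then form the
// pix1 - pix2 residual rows that are later fed to LSX_DCT8_1D.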
vilvl.b vr9, vr31, vr9 vilvl.b vr2, vr30, vr2 vilvl.b vr10, vr31, vr10 vilvl.b vr3, vr30, vr3 vilvl.b vr11, vr31, vr11 vsub.h \data1, vr0, vr2 vsub.h \data3, vr8, vr10 vsub.h \data2, vr1, vr3 vsub.h \data4, vr9, vr11 addi.d a1, a1, FENC_STRIDE * 2 addi.d a2, a2, FDEC_STRIDE * 2 .endm .macro SUB16x8_DCT8_LSX LSX_LOAD_PIX_22 vr12, vr13, vr22, vr23 LSX_LOAD_PIX_22 vr14, vr15, vr24, vr25 LSX_LOAD_PIX_22 vr16, vr17, vr26, vr27 LSX_LOAD_PIX_22 vr18, vr19, vr28, vr29 LSX_DCT8_1D LSX_TRANSPOSE8x8_H vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_DCT8_1D vst vr12, a0, 0 vst vr13, a0, 16 vst vr14, a0, 32 vst vr15, a0, 48 vst vr16, a0, 64 vst vr17, a0, 80 vst vr18, a0, 96 vst vr19, a0, 112 vmov vr12, vr22 vmov vr13, vr23 vmov vr14, vr24 vmov vr15, vr25 vmov vr16, vr26 vmov vr17, vr27 vmov vr18, vr28 vmov vr19, vr29 LSX_DCT8_1D LSX_TRANSPOSE8x8_H vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 LSX_DCT8_1D vst vr12, a0, 128 vst vr13, a0, 144 vst vr14, a0, 160 vst vr15, a0, 176 vst vr16, a0, 192 vst vr17, a0, 208 vst vr18, a0, 224 vst vr19, a0, 240 .endm function_x264 sub16x16_dct8_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 move t1, a1 move t3, a2 SUB16x8_DCT8_LSX addi.d a0, a0, 256 addi.d a1, t1, FENC_STRIDE * 8 addi.d a2, t3, FDEC_STRIDE * 8 SUB16x8_DCT8_LSX fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] ) */ function_x264 zigzag_scan_4x4_frame_lasx xvld xr1, a1, 0 xvor.v xr2, xr1, xr1 xvpermi.q xr2, xr2, 0x13 xvpermi.q xr1, xr1, 0x02 la.local t0, zigzag_scan4 xvld xr3, t0, 0 xvshuf.h xr3, xr2, xr1 xvst xr3, a0, 0 endfunc_x264 function_x264 zigzag_scan_4x4_frame_lsx vld vr1, a1, 0 vld vr2, a1, 16 vor.v vr3, vr1, vr1 vor.v vr4, vr2, vr2 la.local t0, zigzag_scan4 vld vr5, t0, 0 vld vr6, t0, 16 vshuf.h vr5, vr4, vr1 vshuf.h vr6, vr4, vr1 vst vr5, a0, 0 vst vr6, a0, 16 endfunc_x264 x264-master/common/loongarch/dct.h000066400000000000000000000115311502133446700172470ustar00rootroot00000000000000/***************************************************************************** * dct.h: loongarch transform and zigzag ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Peng Zhou * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/

#ifndef X264_LOONGARCH_DCT_H
#define X264_LOONGARCH_DCT_H

#define x264_sub8x8_dct_lasx x264_template(sub8x8_dct_lasx)
void x264_sub8x8_dct_lasx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_lasx x264_template(sub16x16_dct_lasx)
void x264_sub16x16_dct_lasx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_sub16x16_dct8_lasx x264_template(sub16x16_dct8_lasx)
void x264_sub16x16_dct8_lasx( int16_t pi_dct[4][64], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_add8x8_idct_lasx x264_template(add8x8_idct_lasx)
void x264_add8x8_idct_lasx( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_lasx x264_template(add16x16_idct_lasx)
void x264_add16x16_idct_lasx( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_lasx x264_template(add8x8_idct8_lasx)
void x264_add8x8_idct8_lasx( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add8x8_idct_dc_lasx x264_template(add8x8_idct_dc_lasx)
void x264_add8x8_idct_dc_lasx( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_lasx x264_template(add16x16_idct_dc_lasx)
void x264_add16x16_idct_dc_lasx( uint8_t *p_dst, int16_t dct[16] );
#define x264_idct4x4dc_lasx x264_template(idct4x4dc_lasx)
void x264_idct4x4dc_lasx( int16_t d[16] );
#define x264_dct4x4dc_lasx x264_template(dct4x4dc_lasx)
void x264_dct4x4dc_lasx( int16_t d[16] );
#define x264_zigzag_scan_4x4_frame_lasx x264_template(zigzag_scan_4x4_frame_lasx)
void x264_zigzag_scan_4x4_frame_lasx( int16_t level[16], int16_t dct[16] );

#define x264_sub4x4_dct_lsx x264_template(sub4x4_dct_lsx)
void x264_sub4x4_dct_lsx( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct_lsx x264_template(sub8x8_dct_lsx)
void x264_sub8x8_dct_lsx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_lsx x264_template(sub16x16_dct_lsx)
void x264_sub16x16_dct_lsx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx)
void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_sub16x16_dct8_lsx x264_template(sub16x16_dct8_lsx)
void x264_sub16x16_dct8_lsx( int16_t pi_dct[4][64], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx)
void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_add8x8_idct_lsx x264_template(add8x8_idct_lsx)
void x264_add8x8_idct_lsx( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_lsx x264_template(add16x16_idct_lsx)
void x264_add16x16_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_lsx x264_template(add8x8_idct8_lsx)
void x264_add8x8_idct8_lsx( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add8x8_idct_dc_lsx x264_template(add8x8_idct_dc_lsx)
void x264_add8x8_idct_dc_lsx( uint8_t *p_dst, int16_t dct[4] );
#define x264_add16x16_idct_dc_lsx x264_template(add16x16_idct_dc_lsx)
void x264_add16x16_idct_dc_lsx( uint8_t *p_dst, int16_t dct[16] );
#define x264_idct4x4dc_lsx x264_template(idct4x4dc_lsx)
void x264_idct4x4dc_lsx( int16_t d[16] );
#define x264_dct4x4dc_lsx x264_template(dct4x4dc_lsx)
void
x264_dct4x4dc_lsx( int16_t d[16] ); #define x264_zigzag_scan_4x4_frame_lsx x264_template(zigzag_scan_4x4_frame_lsx) void x264_zigzag_scan_4x4_frame_lsx( int16_t level[16], int16_t dct[16] ); #endif x264-master/common/loongarch/deblock-a.S000066400000000000000000001610431502133446700202750ustar00rootroot00000000000000/***************************************************************************** * deblock-a.S: loongarch deblock functions ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Hao Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" #if !HIGH_BIT_DEPTH const shuf_loc_locn .byte 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28 .byte 16, 24, 0, 8, 17, 25, 1, 9, 18, 26, 2, 10, 19, 27, 3, 11 endconst const shuf_locn .byte 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 endconst /*Transpose 16 * 6 block with byte elements in vectors*/ .macro LASX_TRANSPOSE in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15,\ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,\ out0, out1, out2, out3, out4, out5 xvilvl.b \tmp0, \in1, \in0 xvilvl.b \tmp1, \in3, \in2 xvilvl.b \tmp2, \in5, \in4 xvilvl.b \tmp3, \in7, \in6 xvilvl.b \tmp4, \in9, \in8 xvilvl.b \tmp5, \in11, \in10 xvilvl.b \tmp6, \in13, \in12 xvilvl.b \tmp7, \in15, \in14 xvpermi.d \tmp0, \tmp0, 0xD8 xvpermi.d \tmp1, \tmp1, 0xD8 xvpermi.d \tmp2, \tmp2, 0xD8 xvpermi.d \tmp3, \tmp3, 0xD8 xvpermi.d \tmp4, \tmp4, 0xD8 xvpermi.d \tmp5, \tmp5, 0xD8 xvpermi.d \tmp6, \tmp6, 0xD8 xvpermi.d \tmp7, \tmp7, 0xD8 xvilvl.h \out0, \tmp1, \tmp0 xvilvl.h \out1, \tmp3, \tmp2 xvilvl.h \out2, \tmp5, \tmp4 xvilvl.h \out3, \tmp7, \tmp6 xvilvl.w \tmp0, \out1, \out0 xvilvh.w \tmp1, \out1, \out0 xvilvl.w \tmp2, \out3, \out2 xvilvh.w \tmp3, \out3, \out2 xvilvl.d \out0, \tmp2, \tmp0 xvilvh.d \out1, \tmp2, \tmp0 xvilvl.d \out2, \tmp3, \tmp1 xvilvh.d \out3, \tmp3, \tmp1 xvpermi.d \out4, \out0, 0x4E xvpermi.d \out5, \out1, 0x4E .endm /* * void deblock_h_luma_lasx(Pixel *pix, intptr_t stride, int alpha, * int beta, int8_t *tc0) */ function_x264 deblock_h_luma_lasx slli.d t0, a1, 1 slli.d t2, a1, 2 xvldrepl.w xr1, a4, 0 add.d t1, t0, a1 xvreplgr2vr.b xr2, a3 xvilvl.b xr1, xr1, xr1 // Store registers to the stack addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 // Load data from pix addi.d t4, a0, -3 FLDD_LOADX_4 t4, a1, t0, t1, f10, f11, f12, f13 add.d t5, t4, t2 FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17 add.d t5, t5, t2 FLDD_LOADX_4 
t5, a1, t0, t1, f20, f21, f22, f23 add.d t6, t5, t2 FLDD_LOADX_4 t6, a1, t0, t1, f24, f25, f26, f27 LASX_TRANSPOSE xr10, xr11, xr12, xr13, xr14, xr15, xr16, xr17, \ xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27, \ xr8, xr9, xr18, xr19, xr28, xr29, xr30, xr31, \ xr10, xr11, xr12, xr13, xr14, xr15 xvilvl.h xr1, xr1, xr1 vext2xv.hu.bu xr20, xr10 vext2xv.hu.bu xr21, xr11 vext2xv.hu.bu xr22, xr12 vext2xv.hu.bu xr23, xr13 vext2xv.hu.bu xr24, xr14 vext2xv.hu.bu xr25, xr15 vext2xv.h.b xr3, xr1 xvadd.h xr26, xr22, xr23 xvsrari.h xr26, xr26, 1 xvneg.h xr4, xr3 xvadd.h xr27, xr20, xr26 xvadd.h xr28, xr25, xr26 xvsub.h xr29, xr23, xr22 xvsrai.h xr27, xr27, 1 xvsrai.h xr28, xr28, 1 xvslli.h xr29, xr29, 2 xvsub.h xr30, xr21, xr24 xvsub.h xr27, xr27, xr21 xvsub.h xr28, xr28, xr24 xvadd.h xr29, xr29, xr30 xvclip.h xr27, xr27, xr4, xr3 xvclip.h xr28, xr28, xr4, xr3 xvpickev.b xr16, xr25, xr20 xvpickev.b xr17, xr23, xr22 xvabsd.bu xr5, xr16, xr17 xvaddi.hu xr6, xr3, 1 xvslt.bu xr5, xr5, xr2 xvilvl.b xr30, xr5, xr5 xvilvh.b xr31, xr5, xr5 xvbitsel.v xr3, xr3, xr6, xr30 xvsrari.h xr29, xr29, 3 xvaddi.hu xr6, xr3, 1 xvbitsel.v xr3, xr3, xr6, xr31 xvneg.h xr4, xr3 xvclip.h xr29, xr29, xr4, xr3 xvadd.h xr30, xr21, xr27 xvadd.h xr18, xr24, xr28 xvadd.h xr19, xr22, xr29 xvsub.h xr26, xr23, xr29 xvssrarni.bu.h xr26, xr19, 0 xvpickev.b xr25, xr18, xr30 xvpickev.b xr27, xr24, xr21 xvpickev.b xr28, xr23, xr22 xvpickev.b xr18, xr22, xr21 xvabsd.bu xr19, xr18, xr17 xvreplgr2vr.b xr30, a2 xvilvl.d xr31, xr30, xr2 xvabsd.bu xr20, xr14, xr13 xvslt.bu xr19, xr19, xr31 xvslt.bu xr20, xr20, xr2 xvbitsel.v xr25, xr27, xr25, xr5 xvpermi.d xr20, xr20, 0x50 xvand.v xr21, xr20, xr19 xvpermi.d xr7, xr21, 0xB1 xvand.v xr21, xr21, xr7 xvbitsel.v xr25, xr27, xr25, xr21 xvpermi.d xr1, xr1, 0x50 xvbitsel.v xr26, xr28, xr26, xr21 xvslti.b xr30, xr1, 0 xvbitsel.v xr25, xr25, xr27, xr30 xvbitsel.v xr26, xr26, xr28, xr30 xvilvl.b xr10, xr26, xr25 xvilvh.b xr20, xr25, xr26 xvilvl.h xr21, xr20, xr10 xvilvh.h xr22, xr20, xr10 // Store data to pix addi.d t5, a0, -2 xvstelm.w xr21, t5, 0, 0 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 1 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 2 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 3 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 0 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 1 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 2 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 3 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 4 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 5 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 6 add.d t5, t5, a1 xvstelm.w xr21, t5, 0, 7 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 4 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 5 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 6 add.d t5, t5, a1 xvstelm.w xr22, t5, 0, 7 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void deblock_v_luma_lasx(Pixel *pix, intptr_t stride, * int alpha, int beta, int8_t *tc0) */ function_x264 deblock_v_luma_lasx slli.d t0, a1, 1 // Load data from tc0 xvldrepl.w xr1, a4, 0 add.d t1, t0, a1 xvreplgr2vr.b xr2, a3 xvilvl.b xr1, xr1, xr1 // Load data from pix sub.d t5, a0, t1 vld vr10, t5, 0 vldx vr11, t5, a1 vldx vr12, t5, t0 vld vr13, a0, 0 vldx vr14, a0, a1 vldx vr15, a0, t0 // Store registers to the stack addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 xvilvl.h xr1, xr1, xr1 vext2xv.hu.bu xr20, xr10 vext2xv.hu.bu xr21, xr11 vext2xv.hu.bu 
xr22, xr12 vext2xv.hu.bu xr23, xr13 vext2xv.hu.bu xr24, xr14 vext2xv.hu.bu xr25, xr15 vext2xv.h.b xr3, xr1 xvadd.h xr26, xr22, xr23 xvsrari.h xr26, xr26, 1 xvneg.h xr4, xr3 xvadd.h xr27, xr20, xr26 xvadd.h xr28, xr25, xr26 xvsub.h xr29, xr23, xr22 xvsrai.h xr27, xr27, 1 xvsrai.h xr28, xr28, 1 xvslli.h xr29, xr29, 2 xvsub.h xr30, xr21, xr24 xvsub.h xr27, xr27, xr21 xvsub.h xr28, xr28, xr24 xvadd.h xr29, xr29, xr30 xvclip.h xr27, xr27, xr4, xr3 xvclip.h xr28, xr28, xr4, xr3 xvpickev.b xr16, xr25, xr20 xvpickev.b xr17, xr23, xr22 xvabsd.bu xr5, xr16, xr17 xvaddi.hu xr6, xr3, 1 xvslt.bu xr5, xr5, xr2 xvilvl.b xr30, xr5, xr5 xvilvh.b xr31, xr5, xr5 xvbitsel.v xr3, xr3, xr6, xr30 xvsrari.h xr29, xr29, 3 xvaddi.hu xr6, xr3, 1 xvbitsel.v xr3, xr3, xr6, xr31 xvneg.h xr4, xr3 xvclip.h xr29, xr29, xr4, xr3 xvadd.h xr30, xr21, xr27 xvadd.h xr18, xr24, xr28 xvadd.h xr19, xr22, xr29 xvsub.h xr26, xr23, xr29 xvssrarni.bu.h xr26, xr19, 0 xvpickev.b xr25, xr18, xr30 xvpickev.b xr27, xr24, xr21 xvpickev.b xr28, xr23, xr22 xvpickev.b xr18, xr22, xr21 xvabsd.bu xr19, xr18, xr17 xvreplgr2vr.b xr30, a2 xvilvl.d xr31, xr30, xr2 xvabsd.bu xr20, xr14, xr13 xvslt.bu xr19, xr19, xr31 xvslt.bu xr20, xr20, xr2 xvbitsel.v xr25, xr27, xr25, xr5 xvpermi.d xr20, xr20, 0x50 xvand.v xr21, xr20, xr19 xvpermi.d xr7, xr21, 0xB1 xvand.v xr21, xr21, xr7 xvbitsel.v xr25, xr27, xr25, xr21 xvpermi.d xr1, xr1, 0x50 xvbitsel.v xr26, xr28, xr26, xr21 xvslti.b xr30, xr1, 0 xvbitsel.v xr25, xr25, xr27, xr30 xvbitsel.v xr26, xr26, xr28, xr30 sub.d t5, a0, t0 xvpermi.d xr0, xr25, 0xd8 xvpermi.d xr1, xr26, 0xd8 xvpermi.d xr2, xr26, 0x8D xvpermi.d xr3, xr25, 0x8D // Store data to pix vst vr0, t5, 0 vstx vr1, t5, a1 vst vr2, a0, 0 vstx vr3, a0, a1 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void deblock_v_luma_intra_lasx(Pixel *pix, intptr_t stride, * int alpha, int beta) */ function_x264 deblock_v_luma_intra_lasx slli.d t0, a1, 1 slli.d t2, a1, 2 add.d t1, t0, a1 // Load data from pix sub.d t5, a0, t2 vld vr9, t5, 0 vldx vr10, t5, a1 vldx vr11, t5, t0 vldx vr12, t5, t1 vld vr13, a0, 0 vldx vr14, a0, a1 vldx vr15, a0, t0 vldx vr16, a0, t1 // Store registers to the stack addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 xvreplgr2vr.b xr1, a2 xvreplgr2vr.b xr2, a3 vext2xv.hu.bu xr19, xr9 vext2xv.hu.bu xr20, xr10 vext2xv.hu.bu xr21, xr11 vext2xv.hu.bu xr22, xr12 vext2xv.hu.bu xr23, xr13 vext2xv.hu.bu xr24, xr14 vext2xv.hu.bu xr25, xr15 vext2xv.hu.bu xr26, xr16 xvadd.h xr27, xr21, xr22 xvadd.h xr29, xr19, xr20 xvadd.h xr3, xr27, xr23 xvadd.h xr6, xr27, xr24 xvadd.h xr4, xr3, xr20 xvslli.h xr29, xr29, 1 xvadd.h xr5, xr6, xr4 xvadd.h xr6, xr6, xr21 xvadd.h xr5, xr5, xr23 xvadd.h xr7, xr29, xr4 xvsrari.h xr3, xr4, 2 xvsrari.h xr6, xr6, 2 xvsrari.h xr4, xr5, 3 xvadd.h xr27, xr24, xr23 xvadd.h xr28, xr26, xr25 xvsrari.h xr5, xr7, 3 xvadd.h xr29, xr22, xr27 xvslli.h xr28, xr28, 1 xvadd.h xr7, xr29, xr25 xvadd.h xr17, xr27, xr21 xvadd.h xr8, xr7, xr28 xvadd.h xr18, xr17, xr7 xvadd.h xr17, xr17, xr24 xvadd.h xr18, xr18, xr22 xvsrari.h xr7, xr7, 2 xvsrari.h xr8, xr8, 3 xvsrari.h xr18, xr18, 3 xvsrari.h xr17, xr17, 2 xvpickev.b xr27, xr25, xr20 xvpickev.b xr28, xr24, xr21 xvpickev.b xr29, xr23, xr22 xvpickev.b xr9, xr8, xr5 xvpickev.b xr16, xr7, xr3 xvabsd.bu xr30, xr27, xr29 xvpickev.b xr19, xr18, xr4 xvpickev.b 
xr26, xr17, xr6 xvslt.bu xr31, xr30, xr2 xvabsd.bu xr20, xr12, xr13 xvabsd.bu xr21, xr11, xr12 xvabsd.bu xr22, xr14, xr13 xvsrli.b xr0, xr1, 2 xvbitsel.v xr19, xr26, xr19, xr31 xvbitsel.v xr9, xr27, xr9, xr31 xvbitsel.v xr16, xr28, xr16, xr31 xvaddi.bu xr0, xr0, 2 xvpermi.d xr20, xr20, 0x50 xvpermi.d xr21, xr21, 0x50 xvpermi.d xr22, xr22, 0x50 xvslt.bu xr10, xr20, xr0 xvslt.bu xr11, xr20, xr1 xvslt.bu xr12, xr21, xr2 xvslt.bu xr13, xr22, xr2 xvand.v xr30, xr11, xr12 xvand.v xr30, xr30, xr13 xvbitsel.v xr9, xr27, xr9, xr10 xvbitsel.v xr16, xr28, xr16, xr10 xvbitsel.v xr19, xr26, xr19, xr10 xvbitsel.v xr9, xr27, xr9, xr30 xvbitsel.v xr16, xr28, xr16, xr30 xvbitsel.v xr19, xr29, xr19, xr30 xvpermi.d xr1, xr9, 0xD8 xvpermi.d xr2, xr16, 0xD8 xvpermi.d xr3, xr19, 0xD8 xvpermi.d xr4, xr19, 0x8D xvpermi.d xr5, xr16, 0x8D xvpermi.d xr6, xr9, 0x8D // Store data to pix vstx vr1, t5, a1 vstx vr2, t5, t0 vstx vr3, t5, t1 vst vr4, a0, 0 vstx vr5, a0, a1 vstx vr6, a0, t0 // Restore register values fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void deblock_h_luma_intra_lasx(Pixel *pix, intptr_t stride, * int alpha, int beta) */ function_x264 deblock_h_luma_intra_lasx slli.d t0, a1, 1 slli.d t2, a1, 2 addi.d t5, a0, -4 add.d t1, t0, a1 // Store registers to the stack addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 // Load data from pix FLDD_LOADX_4 t5, a1, t0, t1, f10, f11, f12, f13 add.d t5, t5, t2 FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17 add.d t5, t5, t2 FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23 add.d t5, t5, t2 FLDD_LOADX_4 t5, a1, t0, t1, f24, f25, f26, f27 LASX_TRANSPOSE16X8_B xr10, xr11, xr12, xr13, xr14, xr15, xr16, xr17, \ xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27, \ xr9, xr10, xr11, xr12, xr13, xr14, xr15, xr16, \ xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7 xvreplgr2vr.b xr1, a2 xvreplgr2vr.b xr2, a3 vext2xv.hu.bu xr19, xr9 vext2xv.hu.bu xr20, xr10 vext2xv.hu.bu xr21, xr11 vext2xv.hu.bu xr22, xr12 vext2xv.hu.bu xr23, xr13 vext2xv.hu.bu xr24, xr14 vext2xv.hu.bu xr25, xr15 vext2xv.hu.bu xr26, xr16 xvadd.h xr27, xr21, xr22 xvadd.h xr29, xr19, xr20 xvadd.h xr3, xr27, xr23 xvadd.h xr6, xr27, xr24 xvadd.h xr4, xr3, xr20 xvslli.h xr29, xr29, 1 xvadd.h xr5, xr6, xr4 xvadd.h xr6, xr6, xr21 xvadd.h xr5, xr5, xr23 xvadd.h xr7, xr29, xr4 xvsrari.h xr3, xr4, 2 xvsrari.h xr6, xr6, 2 xvsrari.h xr4, xr5, 3 xvadd.h xr27, xr24, xr23 xvadd.h xr28, xr26, xr25 xvsrari.h xr5, xr7, 3 xvadd.h xr29, xr22, xr27 xvslli.h xr28, xr28, 1 xvadd.h xr7, xr29, xr25 xvadd.h xr17, xr27, xr21 xvadd.h xr8, xr7, xr28 xvadd.h xr18, xr17, xr7 xvadd.h xr17, xr17, xr24 xvadd.h xr18, xr18, xr22 xvsrari.h xr7, xr7, 2 xvsrari.h xr8, xr8, 3 xvsrari.h xr18, xr18, 3 xvsrari.h xr17, xr17, 2 xvpickev.b xr27, xr25, xr20 xvpickev.b xr28, xr24, xr21 xvpickev.b xr29, xr23, xr22 xvpickev.b xr9, xr8, xr5 xvpickev.b xr16, xr7, xr3 xvabsd.bu xr30, xr27, xr29 xvpickev.b xr19, xr18, xr4 xvpickev.b xr26, xr17, xr6 xvslt.bu xr31, xr30, xr2 xvabsd.bu xr20, xr12, xr13 xvabsd.bu xr21, xr11, xr12 xvabsd.bu xr22, xr14, xr13 xvsrli.b xr0, xr1, 2 xvbitsel.v xr19, xr26, xr19, xr31 xvbitsel.v xr9, xr27, xr9, xr31 xvbitsel.v xr16, xr28, xr16, xr31 xvaddi.bu xr0, xr0, 2 xvpermi.d xr20, xr20, 0x50 xvpermi.d xr21, xr21, 0x50 xvpermi.d xr22, xr22, 0x50 xvslt.bu xr10, xr20, xr0 xvslt.bu xr11, xr20, xr1 xvslt.bu xr12, 
xr21, xr2 xvslt.bu xr13, xr22, xr2 xvand.v xr30, xr11, xr12 xvand.v xr30, xr30, xr13 xvbitsel.v xr9, xr27, xr9, xr10 xvbitsel.v xr16, xr28, xr16, xr10 xvbitsel.v xr19, xr26, xr19, xr10 xvbitsel.v xr9, xr27, xr9, xr30 xvbitsel.v xr16, xr28, xr16, xr30 xvbitsel.v xr19, xr29, xr19, xr30 xvilvl.b xr0, xr16, xr9 xvpermi.d xr18, xr19, 0xB1 xvilvh.b xr1, xr9, xr16 xvilvl.b xr2, xr18, xr19 addi.d t5, a0, -3 xvilvl.h xr3, xr2, xr0 xvilvh.h xr4, xr2, xr0 // Store data to pix xvstelm.w xr3, t5, 0, 0 xvstelm.h xr1, t5, 4, 0 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 1 xvstelm.h xr1, t5, 4, 1 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 2 xvstelm.h xr1, t5, 4, 2 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 3 xvstelm.h xr1, t5, 4, 3 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 0 xvstelm.h xr1, t5, 4, 4 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 1 xvstelm.h xr1, t5, 4, 5 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 2 xvstelm.h xr1, t5, 4, 6 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 3 xvstelm.h xr1, t5, 4, 7 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 4 xvstelm.h xr1, t5, 4, 8 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 5 xvstelm.h xr1, t5, 4, 9 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 6 xvstelm.h xr1, t5, 4, 10 add.d t5, t5, a1 xvstelm.w xr3, t5, 0, 7 xvstelm.h xr1, t5, 4, 11 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 4 xvstelm.h xr1, t5, 4, 12 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 5 xvstelm.h xr1, t5, 4, 13 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 6 xvstelm.h xr1, t5, 4, 14 add.d t5, t5, a1 xvstelm.w xr4, t5, 0, 7 xvstelm.h xr1, t5, 4, 15 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], * int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], * int mvy_limit, int bframe ) */ function_x264 deblock_strength_lasx // dir = 0 s1 = 8 s2 = 1 vldi vr18, 2 vldi vr19, 1 addi.d t0, zero, 4 xvreplgr2vr.h xr20, t0 xvreplgr2vr.h xr21, a4 xvld xr0, a0, 11 xvpermi.q xr1, xr0, 0x01 la.local t0, shuf_loc_locn xvld xr23, t0, 0 xvshuf.b xr4, xr1, xr0, xr23 xvpermi.q xr5, xr4, 0x01 vor.v vr6, vr4, vr5 vseqi.b vr6, vr6, 0 vmov vr15, vr6 vxor.v vr8, vr8, vr8 vbitsel.v vr8, vr18, vr8, vr6 xvld xr0, a1, 11 xvpermi.q xr1, xr0, 0x01 xvshuf.b xr4, xr1, xr0, xr23 xvpermi.q xr5, xr4, 0x01 vseq.b vr4, vr4, vr5 vseqi.b vr4, vr4, 0 vld vr0, a2, 44 vld vr1, a2, 76 vld vr5, a2, 108 vld vr6, a2, 140 vilvl.h vr9, vr1, vr0 vilvl.h vr10, vr6, vr5 vilvl.w vr11, vr10, vr9 vilvh.w vr12, vr10, vr9 vilvh.h vr9, vr1, vr0 vilvh.h vr10, vr6, vr5 vilvl.w vr13, vr10, vr9 vilvh.w vr14, vr10, vr9 vilvl.d vr0, vr13, vr12 ld.h t0, a2, 60 ld.h t1, a2, 92 ld.h t2, a2, 124 ld.h t3, a2, 156 vmov vr6, vr14 vinsgr2vr.h vr6, t0, 4 vinsgr2vr.h vr6, t1, 5 vinsgr2vr.h vr6, t2, 6 vinsgr2vr.h vr6, t3, 7 vilvl.d vr1, vr12, vr11 vilvl.d vr5, vr14, vr13 xvpermi.q xr0, xr6, 0x02 // mv[0][loc][0] xvpermi.q xr5, xr1, 0x20 // mv[0][locn][0] xvabsd.h xr5, xr0, xr5 xvsle.h xr5, xr20, xr5 vilvh.d vr0, vr13, vr12 ld.h t0, a2, 62 ld.h t1, a2, 94 ld.h t2, a2, 126 ld.h t3, a2, 158 vbsrl.v vr7, vr14, 8 vinsgr2vr.h vr7, t0, 4 vinsgr2vr.h vr7, t1, 5 vinsgr2vr.h vr7, t2, 6 vinsgr2vr.h vr7, t3, 7 vilvh.d vr1, vr12, vr11 vilvh.d vr6, vr14, vr13 xvpermi.q xr0, xr7, 0x02 // mv[0][loc][1] xvpermi.q xr6, xr1, 0x20 // mv[0][locn][1] xvabsd.h xr6, xr0, xr6 xvsle.h xr6, xr21, xr6 xvor.v xr5, xr5, xr6 xvpickev.b xr5, xr5, xr5 xvpermi.d xr5, xr5, 0xd8 vor.v vr17, vr4, vr5 beqz a5, .bframe_iszero_0 // bframe != 0 xvld xr0, 
a1, 51 xvpermi.q xr1, xr0, 0x01 xvshuf.b xr4, xr1, xr0, xr23 xvpermi.q xr5, xr4, 0x01 vseq.b vr4, vr4, vr5 vseqi.b vr4, vr4, 0 vld vr0, a2, 204 vld vr1, a2, 236 vld vr5, a2, 268 vld vr6, a2, 300 vilvl.h vr9, vr1, vr0 vilvl.h vr10, vr6, vr5 vilvl.w vr11, vr10, vr9 vilvh.w vr12, vr10, vr9 vilvh.h vr9, vr1, vr0 vilvh.h vr10, vr6, vr5 vilvl.w vr13, vr10, vr9 vilvh.w vr14, vr10, vr9 vilvl.d vr0, vr13, vr12 ld.h t0, a2, 220 ld.h t1, a2, 252 ld.h t2, a2, 284 ld.h t3, a2, 316 vmov vr6, vr14 vinsgr2vr.h vr6, t0, 4 vinsgr2vr.h vr6, t1, 5 vinsgr2vr.h vr6, t2, 6 vinsgr2vr.h vr6, t3, 7 vilvl.d vr1, vr12, vr11 vilvl.d vr5, vr14, vr13 xvpermi.q xr0, xr6, 0x02 // mv[1][loc][0] xvpermi.q xr5, xr1, 0x20 // mv[1][locn][0] xvabsd.h xr5, xr0, xr5 xvsle.h xr5, xr20, xr5 vilvh.d vr0, vr13, vr12 ld.h t0, a2, 222 ld.h t1, a2, 254 ld.h t2, a2, 286 ld.h t3, a2, 318 vbsrl.v vr7, vr14, 8 vinsgr2vr.h vr7, t0, 4 vinsgr2vr.h vr7, t1, 5 vinsgr2vr.h vr7, t2, 6 vinsgr2vr.h vr7, t3, 7 vilvh.d vr1, vr12, vr11 vilvh.d vr6, vr14, vr13 xvpermi.q xr0, xr7, 0x02 // mv[1][loc][1] xvpermi.q xr6, xr1, 0x20 // mv[1][locn][1] xvabsd.h xr6, xr0, xr6 xvsle.h xr6, xr21, xr6 xvor.v xr5, xr5, xr6 xvpickev.b xr5, xr5, xr5 xvpermi.d xr5, xr5, 0xd8 vor.v vr5, vr5, vr4 vor.v vr17, vr5, vr17 .bframe_iszero_0: vxor.v vr22, vr22, vr22 vbitsel.v vr22, vr22, vr19, vr17 vbitsel.v vr22, vr8, vr22, vr15 vst vr22, a3, 0 // dir = 1 s1 = 1 s2 = 8 vld vr0, a0, 4 vld vr1, a0, 20 ld.wu t0, a0, 36 vpickev.w vr2, vr1, vr0 vbsrl.v vr3, vr2, 4 vinsgr2vr.w vr3, t0, 3 vor.v vr2, vr3, vr2 vseqi.b vr2, vr2, 0 vmov vr15, vr2 vxor.v vr3, vr3, vr3 vbitsel.v vr3, vr18, vr3, vr2 vld vr0, a1, 4 vld vr1, a1, 20 ld.w t0, a1, 36 vpickev.w vr2, vr1, vr0 vbsrl.v vr4, vr2, 4 vinsgr2vr.w vr4, t0, 3 vseq.b vr2, vr4, vr2 vseqi.b vr2, vr2, 0 vld vr0, a2, 16 vld vr1, a2, 48 vld vr12, a2, 80 vld vr13, a2, 112 vld vr4, a2, 144 vpickev.h vr5, vr1, vr0 vpickev.h vr14, vr13, vr12 xvpermi.q xr5, xr14, 0x02 // mv[0][locn][0] vpickev.h vr7, vr4, vr4 xvpermi.d xr6, xr5, 0x39 xvinsve0.d xr6, xr7, 3 // mv[0][loc][0] xvabsd.h xr5, xr6, xr5 xvsle.h xr5, xr20, xr5 vpickod.h vr6, vr1, vr0 vpickod.h vr14, vr13, vr12 xvpermi.q xr6, xr14, 0x02 // mv[0][locn][1] vpickod.h vr7, vr4, vr4 xvpermi.d xr8, xr6, 0x39 xvinsve0.d xr8, xr7, 3 // mv[0][loc][1] xvabsd.h xr6, xr8, xr6 xvsle.h xr6, xr21, xr6 xvor.v xr5, xr6, xr5 xvpickev.b xr6, xr5, xr5 xvpermi.d xr6, xr6, 0xd8 vor.v vr2, vr6, vr2 beqz a5, .bframe_iszero_1 // bframe != 0 ref[1] vld vr0, a1, 44 vld vr1, a1, 60 ld.w t0, a1, 76 vpickev.w vr0, vr1, vr0 vbsrl.v vr1, vr0, 4 vinsgr2vr.w vr1, t0, 3 vseq.b vr11, vr1, vr0 vseqi.b vr11, vr11, 0 vld vr0, a2, 176 vld vr1, a2, 208 vld vr12, a2, 240 vld vr13, a2, 272 vld vr4, a2, 304 vpickev.h vr5, vr1, vr0 vpickev.h vr14, vr13, vr12 xvpermi.q xr5, xr14, 0x02 // mv[1][locn][0] vpickev.h vr7, vr4, vr4 xvpermi.d xr6, xr5, 0x39 xvinsve0.d xr6, xr7, 3 // mv[1][loc][0] xvabsd.h xr5, xr6, xr5 xvsle.h xr5, xr20, xr5 vpickod.h vr6, vr1, vr0 vpickod.h vr14, vr13, vr12 xvpermi.q xr6, xr14, 0x02 // mv[1][locn][1] vpickod.h vr7, vr4, vr4 xvpermi.d xr8, xr6, 0x39 xvinsve0.d xr8, xr7, 3 // mv[1][loc][1] xvabsd.h xr6, xr8, xr6 xvsle.h xr6, xr21, xr6 xvor.v xr5, xr6, xr5 xvpickev.b xr6, xr5, xr5 xvpermi.d xr6, xr6, 0xd8 vor.v vr6, vr6, vr11 vor.v vr2, vr6, vr2 .bframe_iszero_1: vxor.v vr22, vr22, vr22 vbitsel.v vr22, vr22, vr19, vr2 vbitsel.v vr22, vr3, vr22, vr15 vst vr22, a3, 32 endfunc_x264 /* * void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], * int16_t 
mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], * int mvy_limit, int bframe ) */ function_x264 deblock_strength_lsx // dir = 0 s1 = 8 s2 = 1 vldi vr18, 2 vldi vr19, 1 addi.d t0, zero, 4 vreplgr2vr.h vr20, t0 vreplgr2vr.h vr21, a4 vld vr0, a0, 11 vld vr1, a0, 27 la.local t0, shuf_loc_locn la.local t1, shuf_locn vld vr2, t0, 0 vld vr3, t1, 0 vshuf.b vr4, vr1, vr0, vr2 vshuf.b vr5, vr1, vr0, vr3 vor.v vr6, vr4, vr5 vseqi.b vr6, vr6, 0 vmov vr15, vr6 vxor.v vr8, vr8, vr8 vbitsel.v vr8, vr18, vr8, vr6 vld vr0, a1, 11 vld vr1, a1, 27 vshuf.b vr4, vr1, vr0, vr2 vshuf.b vr5, vr1, vr0, vr3 vseq.b vr4, vr4, vr5 vseqi.b vr4, vr4, 0 vld vr0, a2, 44 vld vr1, a2, 76 vld vr5, a2, 108 vld vr6, a2, 140 vilvl.h vr9, vr1, vr0 vilvl.h vr10, vr6, vr5 vilvl.w vr11, vr10, vr9 vilvh.w vr12, vr10, vr9 vilvh.h vr9, vr1, vr0 vilvh.h vr10, vr6, vr5 vilvl.w vr13, vr10, vr9 vilvh.w vr14, vr10, vr9 vilvl.d vr0, vr13, vr12 ld.h t0, a2, 60 ld.h t1, a2, 92 ld.h t2, a2, 124 ld.h t3, a2, 156 vmov vr6, vr14 vinsgr2vr.h vr6, t0, 4 vinsgr2vr.h vr6, t1, 5 vinsgr2vr.h vr6, t2, 6 vinsgr2vr.h vr6, t3, 7 vilvl.d vr1, vr12, vr11 vilvl.d vr5, vr14, vr13 vabsd.h vr9, vr0, vr1 vabsd.h vr5, vr6, vr5 vsle.h vr9, vr20, vr9 vsle.h vr5, vr20, vr5 vilvh.d vr0, vr13, vr12 ld.h t0, a2, 62 ld.h t1, a2, 94 ld.h t2, a2, 126 ld.h t3, a2, 158 vbsrl.v vr7, vr14, 8 vinsgr2vr.h vr7, t0, 4 vinsgr2vr.h vr7, t1, 5 vinsgr2vr.h vr7, t2, 6 vinsgr2vr.h vr7, t3, 7 vilvh.d vr1, vr12, vr11 vilvh.d vr6, vr14, vr13 vabsd.h vr0, vr0, vr1 vabsd.h vr6, vr7, vr6 vsle.h vr0, vr21, vr0 vsle.h vr6, vr21, vr6 vor.v vr9, vr9, vr0 vor.v vr5, vr5, vr6 vpickev.b vr5, vr5, vr9 vor.v vr17, vr4, vr5 beqz a5, .bframeiszero_0_lsx // bframe != 0 vld vr0, a1, 51 vld vr1, a1, 67 vshuf.b vr4, vr1, vr0, vr2 vshuf.b vr5, vr1, vr0, vr3 vseq.b vr4, vr4, vr5 vseqi.b vr4, vr4, 0 vld vr0, a2, 204 vld vr1, a2, 236 vld vr5, a2, 268 vld vr6, a2, 300 vilvl.h vr9, vr1, vr0 vilvl.h vr10, vr6, vr5 vilvl.w vr11, vr10, vr9 vilvh.w vr12, vr10, vr9 vilvh.h vr9, vr1, vr0 vilvh.h vr10, vr6, vr5 vilvl.w vr13, vr10, vr9 vilvh.w vr14, vr10, vr9 vilvl.d vr0, vr13, vr12 ld.h t0, a2, 220 ld.h t1, a2, 252 ld.h t2, a2, 284 ld.h t3, a2, 316 vmov vr6, vr14 vinsgr2vr.h vr6, t0, 4 vinsgr2vr.h vr6, t1, 5 vinsgr2vr.h vr6, t2, 6 vinsgr2vr.h vr6, t3, 7 vilvl.d vr1, vr12, vr11 vilvl.d vr5, vr14, vr13 vabsd.h vr9, vr0, vr1 vabsd.h vr5, vr6, vr5 vsle.h vr9, vr20, vr9 vsle.h vr5, vr20, vr5 vilvh.d vr0, vr13, vr12 ld.h t0, a2, 222 ld.h t1, a2, 254 ld.h t2, a2, 286 ld.h t3, a2, 318 vbsrl.v vr7, vr14, 8 vinsgr2vr.h vr7, t0, 4 vinsgr2vr.h vr7, t1, 5 vinsgr2vr.h vr7, t2, 6 vinsgr2vr.h vr7, t3, 7 vilvh.d vr1, vr12, vr11 vilvh.d vr6, vr14, vr13 vabsd.h vr0, vr0, vr1 vabsd.h vr6, vr7, vr6 vsle.h vr0, vr21, vr0 vsle.h vr6, vr21, vr6 vor.v vr9, vr9, vr0 vor.v vr5, vr5, vr6 vpickev.b vr5, vr5, vr9 vor.v vr5, vr5, vr4 vor.v vr17, vr5, vr17 .bframeiszero_0_lsx: vxor.v vr22, vr22, vr22 vbitsel.v vr22, vr22, vr19, vr17 vbitsel.v vr22, vr8, vr22, vr15 vst vr22, a3, 0 // dir = 1 s1 = 1 s2 = 8 vld vr0, a0, 4 vld vr1, a0, 20 ld.wu t0, a0, 36 vpickev.w vr2, vr1, vr0 vbsrl.v vr3, vr2, 4 vinsgr2vr.w vr3, t0, 3 vor.v vr2, vr3, vr2 vseqi.b vr2, vr2, 0 vmov vr15, vr2 vxor.v vr3, vr3, vr3 vbitsel.v vr3, vr18, vr3, vr2 vld vr0, a1, 4 vld vr1, a1, 20 ld.w t0, a1, 36 vpickev.w vr2, vr1, vr0 vbsrl.v vr4, vr2, 4 vinsgr2vr.w vr4, t0, 3 vseq.b vr2, vr4, vr2 vseqi.b vr2, vr2, 0 vld vr0, a2, 16 vld vr1, a2, 48 vld vr12, a2, 80 vld vr13, a2, 112 vld vr4, a2, 144 vpickev.h vr5, vr1, vr0 vpickev.h vr14, vr13, vr12 vpickev.h vr7, vr4, vr4 vbsrl.v vr6, vr5, 
8 vilvl.d vr6, vr14, vr6 vilvh.d vr9, vr7, vr14 vabsd.h vr5, vr6, vr5 vabsd.h vr9, vr9, vr14 vsle.h vr5, vr20, vr5 vsle.h vr9, vr20, vr9 vpickod.h vr6, vr1, vr0 vpickod.h vr14, vr13, vr12 vpickod.h vr7, vr4, vr4 vbsrl.v vr8, vr6, 8 vilvl.d vr8, vr14, vr8 vilvh.d vr7, vr7, vr14 vabsd.h vr8, vr8, vr6 vabsd.h vr7, vr7, vr14 vsle.h vr8, vr21, vr8 vsle.h vr6, vr21, vr7 vor.v vr5, vr5, vr8 vor.v vr6, vr9, vr6 vpickev.b vr6, vr6, vr5 vor.v vr2, vr6, vr2 beqz a5, .bframeiszero_1_lsx // bframe != 0 ref[1] vld vr0, a1, 44 vld vr1, a1, 60 ld.w t0, a1, 76 vpickev.w vr0, vr1, vr0 vbsrl.v vr1, vr0, 4 vinsgr2vr.w vr1, t0, 3 vseq.b vr11, vr1, vr0 vseqi.b vr11, vr11, 0 vld vr0, a2, 176 vld vr1, a2, 208 vld vr12, a2, 240 vld vr13, a2, 272 vld vr4, a2, 304 vpickev.h vr5, vr1, vr0 vpickev.h vr14, vr13, vr12 vpickev.h vr7, vr4, vr4 vbsrl.v vr6, vr5, 8 vilvl.d vr6, vr14, vr6 vilvh.d vr9, vr7, vr14 vabsd.h vr5, vr6, vr5 vabsd.h vr9, vr9, vr14 vsle.h vr5, vr20, vr5 vsle.h vr9, vr20, vr9 vpickod.h vr6, vr1, vr0 vpickod.h vr14, vr13, vr12 vpickod.h vr7, vr4, vr4 vbsrl.v vr8, vr6, 8 vilvl.d vr8, vr14, vr8 vilvh.d vr7, vr7, vr14 vabsd.h vr8, vr8, vr6 vabsd.h vr6, vr7, vr14 vsle.h vr8, vr21, vr8 vsle.h vr6, vr21, vr6 vor.v vr5, vr5, vr8 vor.v vr7, vr9, vr6 vpickev.b vr6, vr7, vr5 vor.v vr6, vr6, vr11 vor.v vr2, vr6, vr2 .bframeiszero_1_lsx: vxor.v vr22, vr22, vr22 vbitsel.v vr22, vr22, vr19, vr2 vbitsel.v vr22, vr3, vr22, vr15 vst vr22, a3, 32 endfunc_x264 /* * void deblock_v_luma_intra_lsx( pixel *pix, intptr_t stride, int alpha, int beta ) */ function_x264 deblock_v_luma_intra_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a1, 2 // Store registers to the stack addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 // Load data from pix sub.d t3, a0, t2 // t3 = a0 - 4 * stride vld vr3, t3, 0 // p3 vldx vr2, t3, a1 // p2 vldx vr1, t3, t0 // p1 vldx vr0, t3, t1 // p0 vld vr10, a0, 0 // q0 vldx vr11, a0, a1 // q1 vldx vr12, a0, t0 // q2 vldx vr13, a0, t1 // q3 vsllwil.hu.bu vr7, vr3, 0 vsllwil.hu.bu vr6, vr2, 0 vsllwil.hu.bu vr5, vr1, 0 vsllwil.hu.bu vr4, vr0, 0 vsllwil.hu.bu vr14, vr10, 0 vsllwil.hu.bu vr15, vr11, 0 vsllwil.hu.bu vr16, vr12, 0 vsllwil.hu.bu vr17, vr13, 0 /* p0', p1', p2' */ vadd.h vr8, vr5, vr4 vadd.h vr9, vr8, vr14 vadd.h vr19, vr7, vr6 vadd.h vr18, vr6, vr9 // pix[-2*xstride] vslli.h vr19, vr19, 1 vadd.h vr20, vr9, vr18 vadd.h vr19, vr19, vr18 // pix[-3*xstride] vadd.h vr20, vr20, vr15 // pix[-1*xstride] /* p0' */ vadd.h vr8, vr8, vr15 vadd.h vr21, vr8, vr5 // pix[-1*xstride] // /* q0', q1', q2' */ vadd.h vr8, vr15, vr14 vadd.h vr9, vr8, vr4 vadd.h vr23, vr17, vr16 vadd.h vr22, vr9, vr16 // pix[1*xstride] vslli.h vr23, vr23, 1 vadd.h vr24, vr9, vr22 vadd.h vr23, vr23, vr22 // pix[2*xstride] vadd.h vr24, vr24, vr5 // pix[0*xstride] /* q0' */ vadd.h vr8, vr8, vr5 vadd.h vr25, vr8, vr15 // pix[0*xstride] vexth.hu.bu vr7, vr3 vexth.hu.bu vr6, vr2 vexth.hu.bu vr5, vr1 vexth.hu.bu vr4, vr0 vexth.hu.bu vr14, vr10 vexth.hu.bu vr15, vr11 vexth.hu.bu vr16, vr12 vexth.hu.bu vr17, vr13 /* p0', p1', p2' */ vadd.h vr8, vr5, vr4 vadd.h vr9, vr8, vr14 vadd.h vr27, vr6, vr9 // pix[-2*xstride] vadd.h vr28, vr7, vr6 vslli.h vr28, vr28, 1 vadd.h vr29, vr9, vr27 vadd.h vr28, vr28, vr27 // pix[-3*xstride] vadd.h vr29, vr29, vr15 // pix[-1*xstride] /* p0' */ vadd.h vr8, vr8, vr15 vadd.h vr30, vr8, vr5 // pix[-1*xstride] /* q0', q1', q2' */ vadd.h vr8, vr15, vr14 vadd.h vr9, vr8, vr4 vadd.h vr3, vr17, vr16 vadd.h 
vr31, vr9, vr16 // pix[1*xstride] vslli.h vr3, vr3, 1 vadd.h vr13, vr9, vr31 vadd.h vr3, vr3, vr31 // pix[2*xstride] vadd.h vr13, vr13, vr5 // pix[0*xstride] /* q0' */ vadd.h vr8, vr8, vr5 vadd.h vr9, vr8, vr15 // pix[0*xstride] vsrarni.b.h vr28, vr19, 3 // pix[-3*xstride] vsrarni.b.h vr27, vr18, 2 // pix[-2*xstride] vsrarni.b.h vr29, vr20, 3 // pix[-1*xstride] vsrarni.b.h vr30, vr21, 2 // pix[-1*xstride] p0' vsrarni.b.h vr13, vr24, 3 // pix[ 0*xstride] vsrarni.b.h vr31, vr22, 2 // pix[ 1*xstride] vsrarni.b.h vr3, vr23, 3 // pix[ 2*xstride] vsrarni.b.h vr9, vr25, 2 // pix[ 0*xstride] q0' vreplgr2vr.b vr18, a2 // alpha vreplgr2vr.b vr19, a3 // beta vabsd.bu vr26, vr0, vr10 vabsd.bu vr8, vr1, vr0 vabsd.bu vr16, vr11, vr10 vslt.bu vr20, vr26, vr18 vslt.bu vr21, vr8, vr19 vslt.bu vr22, vr16, vr19 vand.v vr20, vr20, vr21 vand.v vr20, vr20, vr22 // if_1 vsrli.b vr18, vr18, 2 vaddi.bu vr18, vr18, 2 vslt.bu vr26, vr26, vr18 // if_2 vabsd.bu vr23, vr2, vr0 vslt.bu vr23, vr23, vr19 // if_3 vand.v vr16, vr23, vr26 // if_2 && if_3 vnor.v vr24, vr16, vr16 // !(if_2 && if_3) vand.v vr24, vr24, vr20 // if_1 && !(if_2 && if_3) vand.v vr16, vr16, vr20 // if_1 && if_2 && if_3 vbitsel.v vr4, vr2, vr28, vr16 // pix[-3*xstride] vbitsel.v vr5, vr1, vr27, vr16 // pix[-2*xstride] vbitsel.v vr6, vr0, vr30, vr24 vbitsel.v vr6, vr6, vr29, vr16 // pix[-1*xstride] vabsd.bu vr7, vr12, vr10 vslt.bu vr7, vr7, vr19 // if_4 vand.v vr17, vr7, vr26 // if_2 && if_4 vnor.v vr14, vr17, vr17 // !(if_2 && if_4) vand.v vr14, vr14, vr20 // if_1 && !(if_2 && if_4) vand.v vr17, vr17, vr20 // if_1 && if_2 && if_4 vbitsel.v vr15, vr10, vr9, vr14 vbitsel.v vr15, vr15, vr13, vr17 // pix[ 0*xstride] vbitsel.v vr9, vr11, vr31, vr17 // pix[ 1*xstride] vbitsel.v vr13, vr12, vr3, vr17 // pix[ 2*xstride] vstx vr4, t3, a1 vstx vr5, t3, t0 vstx vr6, t3, t1 vst vr15, a0, 0 vstx vr9, a0, a1 vstx vr13, a0, t0 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) */ function_x264 deblock_h_luma_intra_lsx slli.d t0, a1, 1 slli.d t2, a1, 2 addi.d t5, a0, -4 add.d t1, t0, a1 // Store registers to the stack addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 // Load data from pix FLDD_LOADX_4 t5, a1, t0, t1, f10, f11, f12, f13 add.d t5, t5, t2 FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17 add.d t5, t5, t2 FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23 add.d t5, t5, t2 FLDD_LOADX_4 t5, a1, t0, t1, f24, f25, f26, f27 vilvl.b vr11, vr11, vr10 vilvl.b vr13, vr13, vr12 vilvl.b vr15, vr15, vr14 vilvl.b vr17, vr17, vr16 vilvl.h vr0, vr13, vr11 vilvl.h vr1, vr17, vr15 vilvh.h vr2, vr13, vr11 vilvh.h vr3, vr17, vr15 vilvl.w vr4, vr1, vr0 vilvl.w vr6, vr3, vr2 vilvh.w vr5, vr1, vr0 vilvh.w vr7, vr3, vr2 vilvl.b vr11, vr21, vr20 vilvl.b vr13, vr23, vr22 vilvl.b vr15, vr25, vr24 vilvl.b vr17, vr27, vr26 vilvl.h vr0, vr13, vr11 vilvl.h vr1, vr17, vr15 vilvh.h vr2, vr13, vr11 vilvh.h vr3, vr17, vr15 vilvl.w vr24, vr1, vr0 vilvl.w vr26, vr3, vr2 vilvh.w vr25, vr1, vr0 vilvh.w vr27, vr3, vr2 vilvl.d vr3, vr24, vr4 // p3 vilvh.d vr2, vr24, vr4 // p2 vilvl.d vr1, vr25, vr5 // p1 vilvh.d vr0, vr25, vr5 // p0 vilvl.d vr10, vr26, vr6 // q0 vilvh.d vr11, vr26, vr6 // q1 vilvl.d vr12, vr27, vr7 // q2 vilvh.d vr13, vr27, vr7 // q3 vsllwil.hu.bu vr7, vr3, 0 
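// After the transpose above, vr3..vr0 hold p3..p0 and vr10..vr13 hold q0..q3
// for the 16 columns being filtered; vsllwil.hu.bu here (and vexth.hu.bu
// further down) widen the low and high byte halves to 16-bit lanes so the
// intra-filter sums below cannot overflow.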
vsllwil.hu.bu vr6, vr2, 0 vsllwil.hu.bu vr5, vr1, 0 vsllwil.hu.bu vr4, vr0, 0 vsllwil.hu.bu vr14, vr10, 0 vsllwil.hu.bu vr15, vr11, 0 vsllwil.hu.bu vr16, vr12, 0 vsllwil.hu.bu vr17, vr13, 0 /* p0', p1', p2' */ vadd.h vr8, vr5, vr4 vadd.h vr9, vr8, vr14 vadd.h vr19, vr7, vr6 vadd.h vr18, vr6, vr9 // pix[-2*xstride] vslli.h vr19, vr19, 1 vadd.h vr20, vr9, vr18 vadd.h vr19, vr19, vr18 // pix[-3*xstride] vadd.h vr20, vr20, vr15 // pix[-1*xstride] /* p0' */ vadd.h vr8, vr8, vr15 vadd.h vr21, vr8, vr5 // pix[-1*xstride] /* q0', q1', q2' */ vadd.h vr8, vr15, vr14 vadd.h vr9, vr8, vr4 vadd.h vr23, vr17, vr16 vadd.h vr22, vr9, vr16 // pix[1*xstride] vslli.h vr23, vr23, 1 vadd.h vr24, vr9, vr22 vadd.h vr23, vr23, vr22 // pix[2*xstride] vadd.h vr24, vr24, vr5 // pix[0*xstride] /* q0' */ vadd.h vr8, vr8, vr5 vadd.h vr25, vr8, vr15 // pix[0*xstride] vexth.hu.bu vr7, vr3 vexth.hu.bu vr6, vr2 vexth.hu.bu vr5, vr1 vexth.hu.bu vr4, vr0 vexth.hu.bu vr14, vr10 vexth.hu.bu vr15, vr11 vexth.hu.bu vr16, vr12 vexth.hu.bu vr17, vr13 /* p0', p1', p2' */ vadd.h vr8, vr5, vr4 vadd.h vr9, vr8, vr14 vadd.h vr27, vr6, vr9 // pix[-2*xstride] vadd.h vr28, vr7, vr6 vslli.h vr28, vr28, 1 vadd.h vr29, vr9, vr27 vadd.h vr28, vr28, vr27 // pix[-3*xstride] vadd.h vr29, vr29, vr15 // pix[-1*xstride] /* p0' */ vadd.h vr8, vr8, vr15 vadd.h vr30, vr8, vr5 // pix[-1*xstride] /* q0', q1', q2' */ vadd.h vr8, vr15, vr14 vadd.h vr9, vr8, vr4 vadd.h vr3, vr17, vr16 vadd.h vr31, vr9, vr16 // pix[1*xstride] vslli.h vr3, vr3, 1 vadd.h vr13, vr9, vr31 vadd.h vr3, vr3, vr31 // pix[2*xstride] vadd.h vr13, vr13, vr5 // pix[0*xstride] /* q0' */ vadd.h vr8, vr8, vr5 vadd.h vr9, vr8, vr15 // pix[0*xstride] vsrarni.b.h vr28, vr19, 3 // pix[-3*xstride] vsrarni.b.h vr27, vr18, 2 // pix[-2*xstride] vsrarni.b.h vr29, vr20, 3 // pix[-1*xstride] vsrarni.b.h vr30, vr21, 2 // pix[-1*xstride] p0' vsrarni.b.h vr13, vr24, 3 // pix[ 0*xstride] vsrarni.b.h vr31, vr22, 2 // pix[ 1*xstride] vsrarni.b.h vr3, vr23, 3 // pix[ 2*xstride] vsrarni.b.h vr9, vr25, 2 // pix[ 0*xstride] q0' vreplgr2vr.b vr18, a2 // alpha vreplgr2vr.b vr19, a3 // beta vabsd.bu vr26, vr0, vr10 vabsd.bu vr8, vr1, vr0 vabsd.bu vr16, vr11, vr10 vslt.bu vr20, vr26, vr18 vslt.bu vr21, vr8, vr19 vslt.bu vr22, vr16, vr19 vand.v vr20, vr20, vr21 vand.v vr20, vr20, vr22 // if_1 vsrli.b vr18, vr18, 2 vaddi.bu vr18, vr18, 2 vslt.bu vr26, vr26, vr18 // if_2 vabsd.bu vr23, vr2, vr0 vslt.bu vr23, vr23, vr19 // if_3 vand.v vr16, vr23, vr26 // if_2 && if_3 vnor.v vr24, vr16, vr16 // !(if_2 && if_3) vand.v vr24, vr24, vr20 // if_1 && !(if_2 && if_3) vand.v vr16, vr16, vr20 // if_1 && if_2 && if_3 vbitsel.v vr4, vr2, vr28, vr16 // pix[-3*xstride] vbitsel.v vr5, vr1, vr27, vr16 // pix[-2*xstride] vbitsel.v vr6, vr0, vr30, vr24 vbitsel.v vr6, vr6, vr29, vr16 // pix[-1*xstride] vabsd.bu vr7, vr12, vr10 vslt.bu vr7, vr7, vr19 // if_4 vand.v vr17, vr7, vr26 // if_2 && if_4 vnor.v vr14, vr17, vr17 // !(if_2 && if_4) vand.v vr14, vr14, vr20 // if_1 && !(if_2 && if_4) vand.v vr17, vr17, vr20 // if_1 && if_2 && if_4 vbitsel.v vr15, vr10, vr9, vr14 vbitsel.v vr15, vr15, vr13, vr17 // pix[ 0*xstride] vbitsel.v vr9, vr11, vr31, vr17 // pix[ 1*xstride] vbitsel.v vr13, vr12, vr3, vr17 // pix[ 2*xstride] vilvl.b vr16, vr5, vr4 vilvl.b vr17, vr15, vr6 vilvl.b vr18, vr13, vr9 vilvh.b vr19, vr5, vr4 vilvh.b vr20, vr15, vr6 vilvh.b vr21, vr13, vr9 vilvl.h vr0, vr17, vr16 vilvh.h vr1, vr17, vr16 vilvl.h vr2, vr20, vr19 vilvh.h vr3, vr20, vr19 addi.d t6, a0, -3 // t6 = a0 -3 vstelm.w vr0, t6, 0, 0 vstelm.h vr18, t6, 4, 0 add.d 
t6, t6, a1 vstelm.w vr0, t6, 0, 1 vstelm.h vr18, t6, 4, 1 add.d t6, t6, a1 vstelm.w vr0, t6, 0, 2 vstelm.h vr18, t6, 4, 2 add.d t6, t6, a1 vstelm.w vr0, t6, 0, 3 vstelm.h vr18, t6, 4, 3 add.d t6, t6, a1 vstelm.w vr1, t6, 0, 0 vstelm.h vr18, t6, 4, 4 add.d t6, t6, a1 vstelm.w vr1, t6, 0, 1 vstelm.h vr18, t6, 4, 5 add.d t6, t6, a1 vstelm.w vr1, t6, 0, 2 vstelm.h vr18, t6, 4, 6 add.d t6, t6, a1 vstelm.w vr1, t6, 0, 3 vstelm.h vr18, t6, 4, 7 add.d t6, t6, a1 vstelm.w vr2, t6, 0, 0 vstelm.h vr21, t6, 4, 0 add.d t6, t6, a1 vstelm.w vr2, t6, 0, 1 vstelm.h vr21, t6, 4, 1 add.d t6, t6, a1 vstelm.w vr2, t6, 0, 2 vstelm.h vr21, t6, 4, 2 add.d t6, t6, a1 vstelm.w vr2, t6, 0, 3 vstelm.h vr21, t6, 4, 3 add.d t6, t6, a1 vstelm.w vr3, t6, 0, 0 vstelm.h vr21, t6, 4, 4 add.d t6, t6, a1 vstelm.w vr3, t6, 0, 1 vstelm.h vr21, t6, 4, 5 add.d t6, t6, a1 vstelm.w vr3, t6, 0, 2 vstelm.h vr21, t6, 4, 6 add.d t6, t6, a1 vstelm.w vr3, t6, 0, 3 vstelm.h vr21, t6, 4, 7 fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 #endif /* !HIGH_BIT_DEPTH */ x264-master/common/loongarch/deblock.h000066400000000000000000000056371502133446700201120ustar00rootroot00000000000000/***************************************************************************** * deblock.h: loongarch deblock ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Hao Chen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_LOONGARCH_DEBLOCK_H #define X264_LOONGARCH_DEBLOCK_H #if !HIGH_BIT_DEPTH #define x264_deblock_v_luma_lasx x264_template(deblock_v_luma_lasx) void x264_deblock_v_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_lasx x264_template(deblock_h_luma_lasx) void x264_deblock_h_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_luma_intra_lsx x264_template(deblock_v_luma_intra_lsx) void x264_deblock_v_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_lsx x264_template(deblock_h_luma_intra_lsx) void x264_deblock_h_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_luma_intra_lasx x264_template(deblock_v_luma_intra_lasx) void x264_deblock_v_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_lasx x264_template(deblock_h_luma_intra_lasx) void x264_deblock_h_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_strength_lsx x264_template(deblock_strength_lsx) void x264_deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_strength_lasx x264_template(deblock_strength_lasx) void x264_deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #endif #endif x264-master/common/loongarch/loongson_asm.S000066400000000000000000000533101502133446700211470ustar00rootroot00000000000000/********************************************************************* * Copyright (c) 2022-2024 Loongson Technology Corporation Limited * Contributed by Xiwei Gu * Shiyou Yin * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. *********************************************************************/ /* * This file is a LoongArch assembly helper file and available under ISC * license. It provides a large number of macros and alias to simplify * writing assembly code, especially for LSX and LASX optimizations. * * Any one can modify it or add new features for his/her own purposes. * Contributing a patch will be appreciated as it might be useful for * others as well. Send patches to loongson contributor mentioned above. * * MAJOR version: Usage changes, incompatible with previous version. * MINOR version: Add new macros/functions, or bug fixes. * MICRO version: Comment changes or implementation changes. 
*/ #define LML_VERSION_MAJOR 0 #define LML_VERSION_MINOR 4 #define LML_VERSION_MICRO 0 #define ASM_PREF #define DEFAULT_ALIGN 5 /* *============================================================================ * macros for specific projetc, set them as needed. * Following LoongML macros for your reference. *============================================================================ */ .macro function name, align=DEFAULT_ALIGN .macro endfunc jirl $r0, $r1, 0x0 .size ASM_PREF\name, . - ASM_PREF\name .purgem endfunc .endm .text ; .align \align ; .globl ASM_PREF\name ; .type ASM_PREF\name, @function ; ASM_PREF\name: ; .endm .macro const name, align=DEFAULT_ALIGN .macro endconst .size \name, . - \name .purgem endconst .endm .section .rodata .align \align \name: .endm /* *============================================================================ * LoongArch register alias *============================================================================ */ #define a0 $a0 #define a1 $a1 #define a2 $a2 #define a3 $a3 #define a4 $a4 #define a5 $a5 #define a6 $a6 #define a7 $a7 #define t0 $t0 #define t1 $t1 #define t2 $t2 #define t3 $t3 #define t4 $t4 #define t5 $t5 #define t6 $t6 #define t7 $t7 #define t8 $t8 #define s0 $s0 #define s1 $s1 #define s2 $s2 #define s3 $s3 #define s4 $s4 #define s5 $s5 #define s6 $s6 #define s7 $s7 #define s8 $s8 #define zero $zero #define sp $sp #define ra $ra #define fa0 $fa0 #define fa1 $fa1 #define fa2 $fa2 #define fa3 $fa3 #define fa4 $fa4 #define fa5 $fa5 #define fa6 $fa6 #define fa7 $fa7 #define ft0 $ft0 #define ft1 $ft1 #define ft2 $ft2 #define ft3 $ft3 #define ft4 $ft4 #define ft5 $ft5 #define ft6 $ft6 #define ft7 $ft7 #define ft8 $ft8 #define ft9 $ft9 #define ft10 $ft10 #define ft11 $ft11 #define ft12 $ft12 #define ft13 $ft13 #define ft14 $ft14 #define ft15 $ft15 #define fs0 $fs0 #define fs1 $fs1 #define fs2 $fs2 #define fs3 $fs3 #define fs4 $fs4 #define fs5 $fs5 #define fs6 $fs6 #define fs7 $fs7 #define f0 $f0 #define f1 $f1 #define f2 $f2 #define f3 $f3 #define f4 $f4 #define f5 $f5 #define f6 $f6 #define f7 $f7 #define f8 $f8 #define f9 $f9 #define f10 $f10 #define f11 $f11 #define f12 $f12 #define f13 $f13 #define f14 $f14 #define f15 $f15 #define f16 $f16 #define f17 $f17 #define f18 $f18 #define f19 $f19 #define f20 $f20 #define f21 $f21 #define f22 $f22 #define f23 $f23 #define f24 $f24 #define f25 $f25 #define f26 $f26 #define f27 $f27 #define f28 $f28 #define f29 $f29 #define f30 $f30 #define f31 $f31 #define vr0 $vr0 #define vr1 $vr1 #define vr2 $vr2 #define vr3 $vr3 #define vr4 $vr4 #define vr5 $vr5 #define vr6 $vr6 #define vr7 $vr7 #define vr8 $vr8 #define vr9 $vr9 #define vr10 $vr10 #define vr11 $vr11 #define vr12 $vr12 #define vr13 $vr13 #define vr14 $vr14 #define vr15 $vr15 #define vr16 $vr16 #define vr17 $vr17 #define vr18 $vr18 #define vr19 $vr19 #define vr20 $vr20 #define vr21 $vr21 #define vr22 $vr22 #define vr23 $vr23 #define vr24 $vr24 #define vr25 $vr25 #define vr26 $vr26 #define vr27 $vr27 #define vr28 $vr28 #define vr29 $vr29 #define vr30 $vr30 #define vr31 $vr31 #define xr0 $xr0 #define xr1 $xr1 #define xr2 $xr2 #define xr3 $xr3 #define xr4 $xr4 #define xr5 $xr5 #define xr6 $xr6 #define xr7 $xr7 #define xr8 $xr8 #define xr9 $xr9 #define xr10 $xr10 #define xr11 $xr11 #define xr12 $xr12 #define xr13 $xr13 #define xr14 $xr14 #define xr15 $xr15 #define xr16 $xr16 #define xr17 $xr17 #define xr18 $xr18 #define xr19 $xr19 #define xr20 $xr20 #define xr21 $xr21 #define xr22 $xr22 #define xr23 $xr23 #define xr24 $xr24 #define xr25 $xr25 #define 
xr26 $xr26 #define xr27 $xr27 #define xr28 $xr28 #define xr29 $xr29 #define xr30 $xr30 #define xr31 $xr31 /* *============================================================================ * LSX/LASX synthesize instructions *============================================================================ */ /* * Description : Dot product of byte vector elements * Arguments : Inputs - vj, vk * Outputs - vd * Return Type - halfword */ .macro vdp2.h.bu vd, vj, vk vmulwev.h.bu \vd, \vj, \vk vmaddwod.h.bu \vd, \vj, \vk .endm .macro vdp2.h.bu.b vd, vj, vk vmulwev.h.bu.b \vd, \vj, \vk vmaddwod.h.bu.b \vd, \vj, \vk .endm .macro vdp2.w.h vd, vj, vk vmulwev.w.h \vd, \vj, \vk vmaddwod.w.h \vd, \vj, \vk .endm .macro xvdp2.h.bu xd, xj, xk xvmulwev.h.bu \xd, \xj, \xk xvmaddwod.h.bu \xd, \xj, \xk .endm .macro xvdp2.h.bu.b xd, xj, xk xvmulwev.h.bu.b \xd, \xj, \xk xvmaddwod.h.bu.b \xd, \xj, \xk .endm .macro xvdp2.w.h xd, xj, xk xvmulwev.w.h \xd, \xj, \xk xvmaddwod.w.h \xd, \xj, \xk .endm /* * Description : Dot product & addition of halfword vector elements * Arguments : Inputs - vj, vk * Outputs - vd * Return Type - twice size of input */ .macro vdp2add.h.bu vd, vj, vk vmaddwev.h.bu \vd, \vj, \vk vmaddwod.h.bu \vd, \vj, \vk .endm .macro vdp2add.h.bu.b vd, vj, vk vmaddwev.h.bu.b \vd, \vj, \vk vmaddwod.h.bu.b \vd, \vj, \vk .endm .macro vdp2add.w.h vd, vj, vk vmaddwev.w.h \vd, \vj, \vk vmaddwod.w.h \vd, \vj, \vk .endm .macro xvdp2add.h.bu.b xd, xj, xk xvmaddwev.h.bu.b \xd, \xj, \xk xvmaddwod.h.bu.b \xd, \xj, \xk .endm .macro xvdp2add.w.h xd, xj, xk xvmaddwev.w.h \xd, \xj, \xk xvmaddwod.w.h \xd, \xj, \xk .endm /* * Description : Range element vj[i] to vk[i] ~ vj[i] * clip: vj > vk ? vj : vk && vj < va ? vj : va */ .macro vclip.h vd, vj, vk, va vmax.h \vd, \vj, \vk vmin.h \vd, \vd, \va .endm .macro vclip.w vd, vj, vk, va vmax.w \vd, \vj, \vk vmin.w \vd, \vd, \va .endm .macro xvclip.h xd, xj, xk, xa xvmax.h \xd, \xj, \xk xvmin.h \xd, \xd, \xa .endm .macro xvclip.w xd, xj, xk, xa xvmax.w \xd, \xj, \xk xvmin.w \xd, \xd, \xa .endm /* * Description : Range element vj[i] to 0 ~ 255 * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0 */ .macro vclip255.h vd, vj vmaxi.h \vd, \vj, 0 vsat.hu \vd, \vd, 7 .endm .macro vclip255.w vd, vj vmaxi.w \vd, \vj, 0 vsat.wu \vd, \vd, 7 .endm .macro xvclip255.h xd, xj xvmaxi.h \xd, \xj, 0 xvsat.hu \xd, \xd, 7 .endm .macro xvclip255.w xd, xj xvmaxi.w \xd, \xj, 0 xvsat.wu \xd, \xd, 7 .endm /* * Description : Store elements of vector * vd : Data vector to be stroed * rk : Address of data storage * ra : Offset of address * si : Index of data in vd */ .macro vstelmx.b vd, rk, ra, si add.d \rk, \rk, \ra vstelm.b \vd, \rk, 0, \si .endm .macro vstelmx.h vd, rk, ra, si add.d \rk, \rk, \ra vstelm.h \vd, \rk, 0, \si .endm .macro vstelmx.w vd, rk, ra, si add.d \rk, \rk, \ra vstelm.w \vd, \rk, 0, \si .endm .macro vstelmx.d vd, rk, ra, si add.d \rk, \rk, \ra vstelm.d \vd, \rk, 0, \si .endm .macro vmov xd, xj vor.v \xd, \xj, \xj .endm .macro xmov xd, xj xvor.v \xd, \xj, \xj .endm .macro xvstelmx.d xd, rk, ra, si add.d \rk, \rk, \ra xvstelm.d \xd, \rk, 0, \si .endm /* *============================================================================ * LSX/LASX custom macros *============================================================================ */ /* * Load 4 float, double, V128, v256 elements with stride. 
*/ .macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 fld.s \out0, \src, 0 fldx.s \out1, \src, \stride fldx.s \out2, \src, \stride2 fldx.s \out3, \src, \stride3 .endm .macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 fld.d \out0, \src, 0 fldx.d \out1, \src, \stride fldx.d \out2, \src, \stride2 fldx.d \out3, \src, \stride3 .endm .macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 vld \out0, \src, 0 vldx \out1, \src, \stride vldx \out2, \src, \stride2 vldx \out3, \src, \stride3 .endm .macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 xvld \out0, \src, 0 xvldx \out1, \src, \stride xvldx \out2, \src, \stride2 xvldx \out3, \src, \stride3 .endm /* * Description : Transpose 4x4 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 vilvl.h \tmp0, \in1, \in0 vilvl.h \tmp1, \in3, \in2 vilvl.w \out0, \tmp1, \tmp0 vilvh.w \out2, \tmp1, \tmp0 vilvh.d \out1, \out0, \out0 vilvh.d \out3, \out0, \out2 .endm /* * Description : Transpose 4x4 block with word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 * Details : * Example : * 1, 2, 3, 4 1, 5, 9,13 * 5, 6, 7, 8 to 2, 6,10,14 * 9,10,11,12 =====> 3, 7,11,15 * 13,14,15,16 4, 8,12,16 */ .macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 vilvl.w \tmp0, \in1, \in0 vilvh.w \out1, \in1, \in0 vilvl.w \tmp1, \in3, \in2 vilvh.w \out3, \in3, \in2 vilvl.d \out0, \tmp1, \tmp0 vilvl.d \out2, \out3, \out1 vilvh.d \out3, \out3, \out1 vilvh.d \out1, \tmp1, \tmp0 .endm /* * Description : Transpose 8x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 */ .macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \ tmp3, tmp4, tmp5, tmp6, tmp7 vilvl.h \tmp0, \in6, \in4 vilvl.h \tmp1, \in7, \in5 vilvl.h \tmp2, \in2, \in0 vilvl.h \tmp3, \in3, \in1 vilvl.h \tmp4, \tmp1, \tmp0 vilvh.h \tmp5, \tmp1, \tmp0 vilvl.h \tmp6, \tmp3, \tmp2 vilvh.h \tmp7, \tmp3, \tmp2 vilvh.h \tmp0, \in6, \in4 vilvh.h \tmp1, \in7, \in5 vilvh.h \tmp2, \in2, \in0 vilvh.h \tmp3, \in3, \in1 vpickev.d \out0, \tmp4, \tmp6 vpickod.d \out1, \tmp4, \tmp6 vpickev.d \out2, \tmp5, \tmp7 vpickod.d \out3, \tmp5, \tmp7 vilvl.h \tmp4, \tmp1, \tmp0 vilvh.h \tmp5, \tmp1, \tmp0 vilvl.h \tmp6, \tmp3, \tmp2 vilvh.h \tmp7, \tmp3, \tmp2 vpickev.d \out4, \tmp4, \tmp6 vpickod.d \out5, \tmp4, \tmp6 vpickev.d \out6, \tmp5, \tmp7 vpickod.d \out7, \tmp5, \tmp7 .endm /* * Description : Transpose 16x8 block with byte elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 */ .macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15, \ out0, out1, out2, out3, out4, out5, out6, out7,\ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 xvilvl.b \tmp0, \in2, \in0 xvilvl.b \tmp1, \in3, \in1 xvilvl.b \tmp2, \in6, \in4 xvilvl.b \tmp3, \in7, \in5 xvilvl.b \tmp4, \in10, \in8 xvilvl.b \tmp5, \in11, \in9 xvilvl.b \tmp6, \in14, \in12 xvilvl.b \tmp7, \in15, \in13 xvilvl.b \out0, \tmp1, \tmp0 xvilvh.b \out1, \tmp1, \tmp0 xvilvl.b \out2, \tmp3, \tmp2 xvilvh.b \out3, \tmp3, \tmp2 xvilvl.b \out4, \tmp5, \tmp4 xvilvh.b 
\out5, \tmp5, \tmp4 xvilvl.b \out6, \tmp7, \tmp6 xvilvh.b \out7, \tmp7, \tmp6 xvilvl.w \tmp0, \out2, \out0 xvilvh.w \tmp2, \out2, \out0 xvilvl.w \tmp4, \out3, \out1 xvilvh.w \tmp6, \out3, \out1 xvilvl.w \tmp1, \out6, \out4 xvilvh.w \tmp3, \out6, \out4 xvilvl.w \tmp5, \out7, \out5 xvilvh.w \tmp7, \out7, \out5 xvilvl.d \out0, \tmp1, \tmp0 xvilvh.d \out1, \tmp1, \tmp0 xvilvl.d \out2, \tmp3, \tmp2 xvilvh.d \out3, \tmp3, \tmp2 xvilvl.d \out4, \tmp5, \tmp4 xvilvh.d \out5, \tmp5, \tmp4 xvilvl.d \out6, \tmp7, \tmp6 xvilvh.d \out7, \tmp7, \tmp6 .endm /* * Description : Transpose 4x4 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 xvilvl.h \tmp0, \in1, \in0 xvilvl.h \tmp1, \in3, \in2 xvilvl.w \out0, \tmp1, \tmp0 xvilvh.w \out2, \tmp1, \tmp0 xvilvh.d \out1, \out0, \out0 xvilvh.d \out3, \out0, \out2 .endm /* * Description : Transpose 4x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 xvilvl.h \tmp0, \in2, \in0 xvilvl.h \tmp1, \in3, \in1 xvilvl.h \out2, \tmp1, \tmp0 xvilvh.h \out3, \tmp1, \tmp0 xvilvl.d \out0, \out2, \out2 xvilvh.d \out1, \out2, \out2 xvilvl.d \out2, \out3, \out3 xvilvh.d \out3, \out3, \out3 .endm /* * Description : Transpose 8x8 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 */ .macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7, \ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 xvilvl.h \tmp0, \in6, \in4 xvilvl.h \tmp1, \in7, \in5 xvilvl.h \tmp2, \in2, \in0 xvilvl.h \tmp3, \in3, \in1 xvilvl.h \tmp4, \tmp1, \tmp0 xvilvh.h \tmp5, \tmp1, \tmp0 xvilvl.h \tmp6, \tmp3, \tmp2 xvilvh.h \tmp7, \tmp3, \tmp2 xvilvh.h \tmp0, \in6, \in4 xvilvh.h \tmp1, \in7, \in5 xvilvh.h \tmp2, \in2, \in0 xvilvh.h \tmp3, \in3, \in1 xvpickev.d \out0, \tmp4, \tmp6 xvpickod.d \out1, \tmp4, \tmp6 xvpickev.d \out2, \tmp5, \tmp7 xvpickod.d \out3, \tmp5, \tmp7 xvilvl.h \tmp4, \tmp1, \tmp0 xvilvh.h \tmp5, \tmp1, \tmp0 xvilvl.h \tmp6, \tmp3, \tmp2 xvilvh.h \tmp7, \tmp3, \tmp2 xvpickev.d \out4, \tmp4, \tmp6 xvpickod.d \out5, \tmp4, \tmp6 xvpickev.d \out6, \tmp5, \tmp7 xvpickod.d \out7, \tmp5, \tmp7 .endm /* * Description : Transpose 2x4x4 block with half-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 */ .macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1, tmp2 xvilvh.h \tmp1, \in0, \in1 xvilvl.h \out1, \in0, \in1 xvilvh.h \tmp0, \in2, \in3 xvilvl.h \out3, \in2, \in3 xvilvh.w \tmp2, \out3, \out1 xvilvl.w \out3, \out3, \out1 xvilvl.w \out2, \tmp0, \tmp1 xvilvh.w \tmp1, \tmp0, \tmp1 xvilvh.d \out0, \out2, \out3 xvilvl.d \out2, \out2, \out3 xvilvh.d \out1, \tmp1, \tmp2 xvilvl.d \out3, \tmp1, \tmp2 .endm /* * Description : Transpose 4x4 block with word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 * Details : * Example : * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13 * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14 * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15 * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16 */ .macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 
xvilvl.w \tmp0, \in1, \in0 xvilvh.w \out1, \in1, \in0 xvilvl.w \tmp1, \in3, \in2 xvilvh.w \out3, \in3, \in2 xvilvl.d \out0, \tmp1, \tmp0 xvilvl.d \out2, \out3, \out1 xvilvh.d \out3, \out3, \out1 xvilvh.d \out1, \tmp1, \tmp0 .endm /* * Description : Transpose 8x8 block with word elements in vectors * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 * Outputs - out0, out1, out2, out3, out4, out5, out6, * _out7 * Example : LASX_TRANSPOSE8x8_W * in0 : 1,2,3,4,5,6,7,8 * in1 : 2,2,3,4,5,6,7,8 * in2 : 3,2,3,4,5,6,7,8 * in3 : 4,2,3,4,5,6,7,8 * in4 : 5,2,3,4,5,6,7,8 * in5 : 6,2,3,4,5,6,7,8 * in6 : 7,2,3,4,5,6,7,8 * in7 : 8,2,3,4,5,6,7,8 * * out0 : 1,2,3,4,5,6,7,8 * out1 : 2,2,2,2,2,2,2,2 * out2 : 3,3,3,3,3,3,3,3 * out3 : 4,4,4,4,4,4,4,4 * out4 : 5,5,5,5,5,5,5,5 * out5 : 6,6,6,6,6,6,6,6 * out6 : 7,7,7,7,7,7,7,7 * out7 : 8,8,8,8,8,8,8,8 */ .macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\ out0, out1, out2, out3, out4, out5, out6, out7,\ tmp0, tmp1, tmp2, tmp3 xvilvl.w \tmp0, \in2, \in0 xvilvl.w \tmp1, \in3, \in1 xvilvh.w \tmp2, \in2, \in0 xvilvh.w \tmp3, \in3, \in1 xvilvl.w \out0, \tmp1, \tmp0 xvilvh.w \out1, \tmp1, \tmp0 xvilvl.w \out2, \tmp3, \tmp2 xvilvh.w \out3, \tmp3, \tmp2 xvilvl.w \tmp0, \in6, \in4 xvilvl.w \tmp1, \in7, \in5 xvilvh.w \tmp2, \in6, \in4 xvilvh.w \tmp3, \in7, \in5 xvilvl.w \out4, \tmp1, \tmp0 xvilvh.w \out5, \tmp1, \tmp0 xvilvl.w \out6, \tmp3, \tmp2 xvilvh.w \out7, \tmp3, \tmp2 xmov \tmp0, \out0 xmov \tmp1, \out1 xmov \tmp2, \out2 xmov \tmp3, \out3 xvpermi.q \out0, \out4, 0x02 xvpermi.q \out1, \out5, 0x02 xvpermi.q \out2, \out6, 0x02 xvpermi.q \out3, \out7, 0x02 xvpermi.q \out4, \tmp0, 0x31 xvpermi.q \out5, \tmp1, 0x31 xvpermi.q \out6, \tmp2, 0x31 xvpermi.q \out7, \tmp3, 0x31 .endm /* * Description : Transpose 4x4 block with double-word elements in vectors * Arguments : Inputs - in0, in1, in2, in3 * Outputs - out0, out1, out2, out3 * Example : LASX_TRANSPOSE4x4_D * in0 : 1,2,3,4 * in1 : 1,2,3,4 * in2 : 1,2,3,4 * in3 : 1,2,3,4 * * out0 : 1,1,1,1 * out1 : 2,2,2,2 * out2 : 3,3,3,3 * out3 : 4,4,4,4 */ .macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ tmp0, tmp1 xvilvl.d \tmp0, \in1, \in0 xvilvh.d \out1, \in1, \in0 xvilvh.d \tmp1, \in3, \in2 xvilvl.d \out2, \in3, \in2 xvor.v \out0, \tmp0, \tmp0 xvor.v \out3, \tmp1, \tmp1 xvpermi.q \out0, \out2, 0x02 xvpermi.q \out2, \tmp0, 0x31 xvpermi.q \out3, \out1, 0x31 xvpermi.q \out1, \tmp1, 0x02 .endm x264-master/common/loongarch/loongson_util.S000066400000000000000000000033061502133446700213440ustar00rootroot00000000000000/***************************************************************************** * loongson_util.S: loongson utility macros ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Shiyou Yin * Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
* * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #define GLUE(a, b) a ## b #define JOIN(a, b) GLUE(a, b) /* Set prefix as needed. */ #define ASM_REF JOIN(JOIN(x264_, BIT_DEPTH), _) #define FENC_STRIDE 16 #define FDEC_STRIDE 32 .macro function_x264 name, align=DEFAULT_ALIGN .macro endfunc_x264 jirl $r0, $r1, 0x0 .size ASM_REF\name, . - ASM_REF\name .purgem endfunc_x264 .endm .text ; .align \align ; .globl ASM_REF\name ; .type ASM_REF\name, @function ; ASM_REF\name: ; .endm x264-master/common/loongarch/mc-a.S000066400000000000000000002772131502133446700173000ustar00rootroot00000000000000/***************************************************************************** * mc-a.S: LoongArch motion compensation ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" const ch_shuf .byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 .byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 endconst const pw_1024 .rept 16 .short 1024 .endr endconst const filt_mul20 .rept 32 .byte 20 .endr endconst const filt_mul15 .rept 16 .byte 1, -5 .endr endconst const filt_mul51 .rept 16 .byte -5, 1 .endr endconst const hpel_shuf .rept 2 .byte 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 .endr endconst const shuf_12 .rept 2 .byte 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 .endr endconst const shuf_14 .rept 2 .byte 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 .endr endconst const shuf_15 .rept 2 .byte 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 .endr endconst const shuf_1 .rept 2 .byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 .endr endconst const shuf_2 .rept 2 .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 .endr endconst const shuf_3 .rept 2 .byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 .endr endconst const shuf_4 .rept 2 .byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 .endr endconst const shuf_6 .rept 2 .byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 .endr endconst #if !HIGH_BIT_DEPTH .macro MC_CHROMA_START srai.d t0, a5, 3 srai.d t1, a6, 3 slli.d t0, t0, 1 mul.d t1, t1, a4 add.d t1, t1, t0 add.d a3, a3, t1 /* src += (m_vy >> 3) * i_src_stride + (m_vx >> 3) * 2 */ .endm /* * void mc_chroma( uint8_t *p_dst_u, uint8_t *p_dst_v, * intptr_t i_dst_stride, * uint8_t *p_src, intptr_t i_src_stride, * int32_t m_vx, int32_t m_vy, * int32_t i_width, int32_t i_height ) */ function_x264 mc_chroma_lasx MC_CHROMA_START andi a5, a5, 0x07 /* m_vx & 0x07 */ andi a6, a6, 0x07 /* m_vy & 0x07 */ move t0, a5 slli.d t0, t0, 8 sub.d t0, t0, a5 li.d a5, 8 addi.d t0, t0, 8 sub.d a5, a5, a6 mul.d a6, a6, t0 /* (x * 255 + 8) * y */ mul.d a5, a5, t0 /* (x * 255 + 8) * (8 - y) */ xvreplgr2vr.h xr6, a6 /* cD cC ... cD cC */ xvreplgr2vr.h xr7, a5 /* cB cA ... 
cB cA */ la.local t0, ch_shuf xvld xr5, t0, 0 addi.d t0, a7, -4 ldptr.w a7, sp, 0 /* a7 = i_height */ slli.d t1, a4, 1 blt zero, t0, .L_WIDTH8 .L_LOOP4: vld vr0, a3, 0 vldx vr1, a3, a4 vldx vr2, a3, t1 xvpermi.q xr0, xr1, 0x02 xvpermi.q xr1, xr2, 0x02 xvshuf.b xr0, xr0, xr0, xr5 xvshuf.b xr1, xr1, xr1, xr5 xvdp2.h.bu xr2, xr0, xr7 xvdp2.h.bu xr3, xr1, xr6 xvadd.h xr0, xr2, xr3 xvssrlrni.bu.h xr0, xr0, 6 xvstelm.w xr0, a0, 0, 0 xvstelm.w xr0, a1, 0, 1 add.d a0, a0, a2 add.d a1, a1, a2 xvstelm.w xr0, a0, 0, 4 xvstelm.w xr0, a1, 0, 5 add.d a0, a0, a2 add.d a1, a1, a2 add.d a3, a3, t1 addi.d a7, a7, -2 blt zero, a7, .L_LOOP4 b .ENDFUNC .L_WIDTH8: xvld xr0, a3, 0 xvpermi.d xr0, xr0, 0x94 xvshuf.b xr0, xr0, xr0, xr5 .L_LOOP8: xvldx xr3, a3, a4 xvpermi.d xr3, xr3, 0x94 xvshuf.b xr3, xr3, xr3, xr5 xvdp2.h.bu xr1, xr0, xr7 xvdp2.h.bu xr2, xr3, xr6 xvdp2.h.bu xr8, xr3, xr7 xvldx xr0, a3, t1 xvpermi.d xr0, xr0, 0x94 xvshuf.b xr0, xr0, xr0, xr5 xvdp2.h.bu xr4, xr0, xr6 xvadd.h xr1, xr1, xr2 xvadd.h xr3, xr8, xr4 xvssrlrni.bu.h xr3, xr1, 6 xvpermi.q xr4, xr3, 0x01 xvpackev.w xr8, xr4, xr3 xvpackod.w xr9, xr4, xr3 vstelm.d vr8, a0, 0, 0 vstelm.d vr9, a1, 0, 0 add.d a0, a0, a2 add.d a1, a1, a2 vstelm.d vr8, a0, 0, 1 vstelm.d vr9, a1, 0, 1 addi.d a7, a7, -2 add.d a0, a0, a2 add.d a1, a1, a2 add.d a3, a3, t1 blt zero, a7, .L_LOOP8 .ENDFUNC: endfunc_x264 .macro PIXEL_AVG_START slli.d t0, a3, 1 add.w t1, t0, a3 slli.d t2, a3, 2 slli.d t3, a5, 1 add.w t4, t3, a5 slli.d t5, a5, 2 slli.d t6, a1, 1 add.w t7, t6, a1 slli.d t8, a1, 2 .endm .macro BIWEIGHT_AVG_START addi.d t0, zero, 64 sub.d t0, t0, a6 xvreplgr2vr.b xr0, a6 xvreplgr2vr.b xr1, t0 xvpackev.b xr8, xr1, xr0 xvxor.v xr9, xr9, xr9 xvaddi.hu xr9, xr9, 6 .endm .macro BIWEIGHT_AVG_CORE a, b xvpermi.d \a, \a, 0x50 xvpermi.d \b, \b, 0x50 xvilvl.b \a, \b, \a xvmulwev.h.bu.b \b, \a, xr8 xvmaddwod.h.bu.b \b, \a, xr8 xvssrarn.bu.h \b, \b, xr9 xvpermi.d \b, \b, 0x08 .endm .macro PIXEL_AVG_START_W8 slli.d t0, a3, 1 add.w t1, t0, a3 slli.d t3, a5, 1 add.w t4, t3, a5 .endm function_x264 pixel_avg_weight_w4_lasx addi.d t0, zero, 64 sub.d t0, t0, a6 vreplgr2vr.b vr0, a6 vreplgr2vr.b vr1, t0 vpackev.b vr8, vr1, vr0 .LOOP_HEIGHT_W4_1: fld.s f0, a2, 0 fldx.s f1, a2, a3 fld.s f2, a4, 0 fldx.s f3, a4, a5 vilvl.w vr0, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.b vr0, vr2, vr0 vmulwev.h.bu.b vr1, vr0, vr8 vmaddwod.h.bu.b vr1, vr0, vr8 vssrarni.bu.h vr1, vr1, 6 fst.s f1, a0, 0 add.d a0, a0, a1 vstelm.w vr1, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a4, a5, a4, 1 addi.w a7, a7, -2 bnez a7, .LOOP_HEIGHT_W4_1 endfunc_x264 function_x264 pixel_avg_w4_lasx .LOOP_HEIGHT_W4: fld.s f0, a2, 0 fldx.s f1, a2, a3 fld.s f4, a4, 0 fldx.s f5, a4, a5 vilvl.w vr0, vr1, vr0 vilvl.w vr4, vr5, vr4 vavgr.bu vr0, vr0, vr4 fst.s f0, a0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a4, a5, a4, 1 addi.w a7, a7, -2 bnez a7, .LOOP_HEIGHT_W4 endfunc_x264 function_x264 pixel_avg_weight_w8_lasx addi.d t0, zero, 64 sub.d t0, t0, a6 xvreplgr2vr.b xr0, a6 xvreplgr2vr.b xr1, t0 xvpackev.b xr8, xr1, xr0 PIXEL_AVG_START_W8 .LOOP_HEIGHT_W8_1: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t0 fldx.d f3, a2, t1 fld.d f4, a4, 0 fldx.d f5, a4, a5 fldx.d f6, a4, t3 fldx.d f7, a4, t4 vilvl.b vr0, vr4, vr0 vilvl.b vr1, vr5, vr1 vilvl.b vr2, vr6, vr2 vilvl.b vr3, vr7, vr3 xvpermi.q xr1, xr0, 0x20 xvpermi.q xr3, xr2, 0x20 xvmulwev.h.bu.b xr2, xr1, xr8 xvmaddwod.h.bu.b xr2, xr1, xr8 xvmulwev.h.bu.b xr4, xr3, xr8 xvmaddwod.h.bu.b xr4, xr3, xr8 xvssrarni.bu.h xr4, xr2, 6 fst.d 
f4, a0, 0 add.d a0, a0, a1 xvstelm.d xr4, a0, 0, 2 add.d a0, a0, a1 xvstelm.d xr4, a0, 0, 1 add.d a0, a0, a1 xvstelm.d xr4, a0, 0, 3 add.d a0, a0, a1 alsl.d a2, a3, a2, 2 alsl.d a4, a5, a4, 2 addi.w a7, a7, -4 bnez a7, .LOOP_HEIGHT_W8_1 endfunc_x264 function_x264 pixel_avg_w8_lasx PIXEL_AVG_START_W8 .LOOP_HEIGHT_W8: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t0 fldx.d f3, a2, t1 fld.d f4, a4, 0 fldx.d f5, a4, a5 fldx.d f6, a4, t3 fldx.d f7, a4, t4 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr6, vr7, vr6 vavgr.bu vr0, vr0, vr4 vavgr.bu vr2, vr2, vr6 fst.d f0, a0, 0 add.d a0, a0, a1 vstelm.d vr0, a0, 0, 1 fstx.d f2, a0, a1 alsl.d a0, a1, a0, 1 vstelm.d vr2, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 2 alsl.d a4, a5, a4, 2 addi.w a7, a7, -4 bnez a7, .LOOP_HEIGHT_W8 endfunc_x264 function_x264 pixel_avg_weight_w16_lasx BIWEIGHT_AVG_START PIXEL_AVG_START .L_HEIGHT_LOOP_T: LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3 LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7 BIWEIGHT_AVG_CORE xr0, xr4 BIWEIGHT_AVG_CORE xr1, xr5 vst vr4, a0, 0 vstx vr5, a0, a1 BIWEIGHT_AVG_CORE xr2, xr6 BIWEIGHT_AVG_CORE xr3, xr7 vstx vr6, a0, t6 vstx vr7, a0, t7 add.d a2, a2, t2 add.d a4, a4, t5 add.d a0, a0, t8 addi.d a7, a7, -4 bnez a7, .L_HEIGHT_LOOP_T endfunc_x264 function_x264 pixel_avg_w16_lasx PIXEL_AVG_START .L_HEIGHT_LOOP: vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t0 vldx vr3, a2, t1 vld vr4, a4, 0 vldx vr5, a4, a5 vldx vr6, a4, t3 vldx vr7, a4, t4 vavgr.bu vr0, vr0, vr4 vavgr.bu vr1, vr1, vr5 vavgr.bu vr2, vr2, vr6 vavgr.bu vr3, vr3, vr7 vst vr0, a0, 0 vstx vr1, a0, a1 vstx vr2, a0, t6 vstx vr3, a0, t7 add.d a0, a0, t8 add.d a2, a2, t2 add.d a4, a4, t5 vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t0 vldx vr3, a2, t1 vld vr4, a4, 0 vldx vr5, a4, a5 vldx vr6, a4, t3 vldx vr7, a4, t4 vavgr.bu vr0, vr0, vr4 vavgr.bu vr1, vr1, vr5 vavgr.bu vr2, vr2, vr6 vavgr.bu vr3, vr3, vr7 vst vr0, a0, 0 vstx vr1, a0, a1 vstx vr2, a0, t6 vstx vr3, a0, t7 add.d a2, a2, t2 add.d a4, a4, t5 add.d a0, a0, t8 addi.d a7, a7, -8 bnez a7, .L_HEIGHT_LOOP endfunc_x264 .macro FILT_PACK_LASX s1, s2, s3 xvmulwev.w.h xr16, \s1, \s3 xvmulwev.w.h xr17, \s2, \s3 xvsrarni.h.w xr17, xr16, 15 xvmaxi.h xr17, xr17, 0 xvsat.hu xr17, xr17, 7 xvmulwod.w.h xr18, \s1, \s3 xvmulwod.w.h xr19, \s2, \s3 xvsrarni.h.w xr19, xr18, 15 xvmaxi.h xr19, xr19, 0 xvsat.hu xr19, xr19, 7 xvpackev.b \s1, xr19, xr17 .endm /* s3: temp, s4: UNUSED, s5: imm */ .macro DO_FILT_V_LASX s1, s2, s3, s4, s5 alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */ alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */ xvld xr1, a3, 0 xvldx xr2, a3, a2 xvld \s3, t2, 0 xvld xr3, a1, 0 xvldx \s1, a1, a2 xvld \s2, t1, 0 xvilvh.b xr16, xr2, xr1 xvilvl.b xr17, xr2, xr1 xvilvh.b xr18, \s2, \s1 xvilvl.b xr19, \s2, \s1 xvilvh.b xr20, \s3, xr3 xvilvl.b xr21, \s3, xr3 xvdp2.h.bu.b xr1, xr17, xr12 xvdp2.h.bu.b xr4, xr16, xr12 xvdp2.h.bu.b \s1, xr19, xr0 xvdp2.h.bu.b xr2, xr18, xr0 xvdp2.h.bu.b xr3, xr21, xr14 xvdp2.h.bu.b \s2, xr20, xr14 xvadd.h xr1, xr1, \s1 xvadd.h xr4, xr4, xr2 xvadd.h xr1, xr1, xr3 xvadd.h xr4, xr4, \s2 xmov \s1, xr1 xmov \s2, xr1 addi.d a3, a3, 32 addi.d a1, a1, 32 xvpermi.q \s1, xr4, 0x2 xvpermi.q \s2, xr4, 0x13 FILT_PACK_LASX xr1, xr4, xr15 addi.d t1, a4, \s5 xvstx xr1, t0, t1 .endm .macro FILT_H s1, s2, s3 xvsub.h \s1, \s1, \s2 xvsrai.h \s1, \s1, 2 xvsub.h \s1, \s1, \s2 xvadd.h \s1, \s1, \s3 xvsrai.h \s1, \s1, 2 xvadd.h \s1, \s1, \s3 .endm .macro FILT_C s1, s2, s3 xmov xr3, \s1 xvpermi.q xr3, \s2, 0x03 xvshuf.b xr1, \s2, xr3, xr23 xvshuf.b xr2, \s2, xr3, 
xr24 xmov \s1, \s2 xvpermi.q \s1, \s3, 0x03 xvshuf.b xr3, \s1, \s2, xr29 xvshuf.b xr4, \s1, \s2, xr27 xvadd.h xr3, xr2, xr3 xmov xr2, \s1 xmov \s1, \s3 xvshuf.b \s3, xr2, \s2, xr30 xvadd.h xr4, xr4, \s2 xvadd.h \s3, \s3, xr1 FILT_H \s3, xr3, xr4 .endm .macro DO_FILT_C_LASX s1, s2, s3, s4 FILT_C \s1, \s2, \s3 FILT_C \s2, \s1, \s4 FILT_PACK_LASX \s3, \s4, xr15 xvpermi.d \s3, \s3, 0xd8 xvstx \s3, a5, a4 .endm .macro DO_FILT_H_LASX s1, s2, s3 xmov xr3, \s1 xvpermi.q xr3, \s2, 0x03 xvshuf.b xr1, \s2, xr3, xr24 xvshuf.b xr2, \s2, xr3, xr25 xmov xr3, \s2 xvpermi.q xr3, \s3, 0x03 xvshuf.b xr4, xr3, \s2, xr26 xvshuf.b xr5, xr3, \s2, xr27 xvshuf.b xr6, xr3, \s2, xr28 xmov \s1, \s2 xvdp2.h.bu.b xr16, xr1, xr12 xvdp2.h.bu.b xr17, xr2, xr12 xvdp2.h.bu.b xr18, \s2, xr14 xvdp2.h.bu.b xr19, xr4, xr14 xvdp2.h.bu.b xr20, xr5, xr0 xvdp2.h.bu.b xr21, xr6, xr0 xvadd.h xr1, xr16, xr18 xvadd.h xr2, xr17, xr19 xvadd.h xr1, xr1, xr20 xvadd.h xr2, xr2, xr21 FILT_PACK_LASX xr1, xr2, xr15 xvshuf.b xr1, xr1, xr1, xr22 xvstx xr1, a0, a4 xmov \s2, \s3 .endm /* * void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, * uint8_t *src, intptr_t stride, int width, int height ) */ function_x264 hpel_filter_lasx addi.d sp, sp, -56 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 move a7, a3 addi.d a5, a5, -32 move t0, a1 andi a7, a7, 31 sub.d a3, a3, a7 add.d a0, a0, a5 add.d t0, t0, a5 add.d a7, a7, a5 add.d a5, a5, a2 move a2, a4 sub.d a7, zero, a7 add.d a1, a3, a2 sub.d a3, a3, a2 sub.d a3, a3, a2 move a4, a7 la.local t1, filt_mul51 xvld xr0, t1, 0 la.local t2, filt_mul15 xvld xr12, t2, 0 la.local t3, filt_mul20 xvld xr14, t3, 0 la.local t4, pw_1024 xvld xr15, t4, 0 la.local t1, hpel_shuf xvld xr22, t1, 0 la.local t2, shuf_12 xvld xr23, t2, 0 la.local t3, shuf_1 xvld xr26, t3, 0 xvaddi.bu xr24, xr23, 2 /* shuf_14 */ xvaddi.bu xr25, xr23, 3 /* shuf_15 */ xvaddi.bu xr27, xr26, 1 /* shuf_2 */ xvaddi.bu xr28, xr26, 2 /* shuf_3 */ xvaddi.bu xr29, xr26, 3 /* shuf_4 */ xvaddi.bu xr30, xr26, 5 /* shuf_6 */ xvxor.v xr9, xr9, xr9 xvxor.v xr10, xr10, xr10 .LOOPY: DO_FILT_V_LASX xr8, xr7, xr13, xr12, 0 .LOOPX: DO_FILT_V_LASX xr6, xr5, xr11, xr12, 32 .LASTX: xvsrli.h xr15, xr15, 1 DO_FILT_C_LASX xr9, xr8, xr7, xr6 xvadd.h xr15, xr15, xr15 xmov xr7, xr5 DO_FILT_H_LASX xr10, xr13, xr11 addi.d a4, a4, 32 blt a4, zero, .LOOPX addi.d t1, a4, -32 blt t1, zero, .LASTX //setup regs for next y sub.d a4, a4, a7 sub.d a4, a4, a2 sub.d a1, a1, a4 sub.d a3, a3, a4 add.d a0, a0, a2 add.d t0, t0, a2 add.d a5, a5, a2 move a4, a7 addi.d a6, a6, -1 blt zero, a6, .LOOPY fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 addi.d sp, sp, 56 endfunc_x264 /* * void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, * pixel *src2, intptr_t src2_stride, int weight); */ .macro PIXEL_AVG w, h function_x264 pixel_avg_\w\()x\h\()_lasx addi.d t0, a6, -32 addi.d a7, zero, \h bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lasx b x264_8_pixel_avg_w\w\()_lasx endfunc_x264 .endm PIXEL_AVG 16, 8 PIXEL_AVG 8, 16 PIXEL_AVG 8, 8 PIXEL_AVG 8, 4 PIXEL_AVG 4, 16 PIXEL_AVG 4, 8 PIXEL_AVG 4, 4 PIXEL_AVG 4, 2 function_x264 mc_weight_w20_noden_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.b xr0, a4, 36 // scale .LOOP_WEIGHTW20_NODEN: xvld xr3, a2, 0 xvldx xr4, a2, a3 xvmulwev.h.bu.b xr7, xr3, xr0 xvmulwev.h.bu.b xr8, xr4, xr0 xvmulwod.h.bu.b xr3, xr3, xr0 xvmulwod.h.bu.b xr4, xr4, xr0 xvadd.h xr7, 
xr7, xr1 xvadd.h xr8, xr8, xr1 xvadd.h xr3, xr3, xr1 xvadd.h xr4, xr4, xr1 xvssrarni.bu.h xr8, xr7, 0 xvssrarni.bu.h xr4, xr3, 0 xvilvl.b xr3, xr4, xr8 xvilvh.b xr4, xr4, xr8 vst vr3, a0, 0 xvstelm.w xr3, a0, 16, 4 add.d a0, a0, a1 vst vr4, a0, 0 xvstelm.w xr4, a0, 16, 4 alsl.d a2, a3, a2, 1 add.d a0, a0, a1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW20_NODEN endfunc_x264 function_x264 mc_weight_w16_noden_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.h xr0, a4, 36 // scale .LOOP_WEIGHTW16_NODEN: vld vr3, a2, 0 vldx vr4, a2, a3 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr4, xr4 xvmul.h xr3, xr3, xr0 xvmul.h xr4, xr4, xr0 xvadd.h xr3, xr3, xr1 xvadd.h xr4, xr4, xr1 xvssrarni.bu.h xr4, xr3, 0 xvpermi.d xr3, xr4, 8 xvpermi.d xr4, xr4, 13 vst vr3, a0, 0 vstx vr4, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW16_NODEN endfunc_x264 function_x264 mc_weight_w8_noden_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.h xr0, a4, 36 // scale .LOOP_WEIGHTW8_NODEN: fld.d f3, a2, 0 fldx.d f4, a2, a3 vilvl.d vr3, vr4, vr3 vext2xv.hu.bu xr3, xr3 xvmul.h xr3, xr3, xr0 xvadd.h xr3, xr3, xr1 xvssrarni.bu.h xr3, xr3, 0 xvstelm.d xr3, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr3, a0, 0, 2 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW8_NODEN endfunc_x264 function_x264 mc_weight_w4_noden_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.h xr0, a4, 36 // scale .LOOP_WEIGHTW4_NODEN: fld.s f3, a2, 0 fldx.s f4, a2, a3 vilvl.w vr3, vr4, vr3 vext2xv.hu.bu xr3, xr3 xvmul.h xr3, xr3, xr0 xvadd.h xr3, xr3, xr1 xvssrarni.bu.h xr3, xr3, 0 xvstelm.w xr3, a0, 0, 0 add.d a0, a0, a1 xvstelm.w xr3, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW4_NODEN endfunc_x264 function_x264 mc_weight_w20_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.b xr0, a4, 36 // scale xvldrepl.h xr2, a4, 32 // denom xvsll.h xr1, xr1, xr2 .LOOP_WEIGHTW20: xvld xr3, a2, 0 xvldx xr4, a2, a3 xvmulwev.h.bu.b xr7, xr3, xr0 xvmulwev.h.bu.b xr8, xr4, xr0 xvmulwod.h.bu.b xr3, xr3, xr0 xvmulwod.h.bu.b xr4, xr4, xr0 xvsadd.h xr7, xr7, xr1 xvsadd.h xr8, xr8, xr1 xvsadd.h xr3, xr3, xr1 xvsadd.h xr4, xr4, xr1 xvssrarn.bu.h xr7, xr7, xr2 xvssrarn.bu.h xr8, xr8, xr2 xvssrarn.bu.h xr3, xr3, xr2 xvssrarn.bu.h xr4, xr4, xr2 xvilvl.b xr3, xr3, xr7 xvilvl.b xr4, xr4, xr8 vst vr3, a0, 0 xvstelm.w xr3, a0, 16, 4 add.d a0, a0, a1 vst vr4, a0, 0 xvstelm.w xr4, a0, 16, 4 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW20 endfunc_x264 function_x264 mc_weight_w16_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.h xr0, a4, 36 // scale xvldrepl.h xr2, a4, 32 // denom xvsll.h xr1, xr1, xr2 .LOOP_WEIGHTW16: vld vr3, a2, 0 vldx vr4, a2, a3 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr4, xr4 xvmul.h xr3, xr3, xr0 xvmul.h xr4, xr4, xr0 xvsadd.h xr3, xr3, xr1 xvsadd.h xr4, xr4, xr1 xvssrarn.bu.h xr3, xr3, xr2 xvssrarn.bu.h xr4, xr4, xr2 xvpermi.d xr3, xr3, 8 xvpermi.d xr4, xr4, 8 vst vr3, a0, 0 vstx vr4, a0, a1 alsl.d a0, a1, a0, 1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW16 endfunc_x264 function_x264 mc_weight_w8_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.h xr0, a4, 36 // scale xvldrepl.h xr2, a4, 32 // denom xvsll.h xr1, xr1, xr2 .LOOP_WEIGHTW8: fld.d f3, a2, 0 fldx.d f4, a2, a3 vilvl.d vr3, vr4, vr3 vext2xv.hu.bu xr3, xr3 xvmul.h xr3, xr3, xr0 xvsadd.h xr3, xr3, xr1 xvssrarn.bu.h xr3, xr3, xr2 xvstelm.d xr3, a0, 0, 0 add.d a0, a0, a1 xvstelm.d xr3, a0, 0, 2 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 
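/* Scalar reference for the mc_weight_w* kernels above (both the _noden_ and
 * the full-denom variants).  The vector code loads denom, scale and offset
 * from the weight argument at byte offsets 32, 36 and 40 (see the // denom,
 * // scale, // offset comments), pre-shifts the offset by denom, multiplies,
 * saturating-adds, and finishes with a rounded narrowing shift.  This is a
 * minimal sketch of that arithmetic for illustration only -- clip8() and the
 * plain C loop are assumptions, not code taken from x264.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   static inline uint8_t clip8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }
 *
 *   static void mc_weight_ref( uint8_t *dst, intptr_t i_dst,
 *                              const uint8_t *src, intptr_t i_src,
 *                              int scale, int offset, int denom,
 *                              int width, int height )
 *   {
 *       int round = denom ? 1 << ( denom - 1 ) : 0;
 *       for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
 *           for( int x = 0; x < width; x++ )
 *               dst[x] = clip8( ( src[x] * scale + ( offset << denom ) + round ) >> denom );
 *   }
 *
 * The _noden_ paths are the denom == 0 case, where this degenerates to a
 * plain clip of src*scale + offset with no shift at all.
 */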
blt zero, a5, .LOOP_WEIGHTW8 endfunc_x264 function_x264 mc_weight_w4_lasx xvldrepl.h xr1, a4, 40 // offset xvldrepl.h xr0, a4, 36 // scale xvldrepl.h xr2, a4, 32 // denom xvsll.h xr1, xr1, xr2 .LOOP_WEIGHTW4: fld.s f3, a2, 0 fldx.s f4, a2, a3 vilvl.w vr3, vr4, vr3 vext2xv.hu.bu xr3, xr3 xvmul.h xr3, xr3, xr0 xvsadd.h xr3, xr3, xr1 xvssrarn.bu.h xr3, xr3, xr2 xvstelm.w xr3, a0, 0, 0 add.d a0, a0, a1 xvstelm.w xr3, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHTW4 endfunc_x264 /* * void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w4_lasx .avg2w4_loop_2: addi.d a5, a5, -2 fld.s f0, a2, 0 fld.s f1, a4, 0 fldx.s f2, a2, a3 fldx.s f3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 vavgr.bu vr0, vr0, vr1 vavgr.bu vr1, vr2, vr3 fst.s f0, a0, 0 fstx.s f1, a0, a1 alsl.d a0, a1, a0, 1 blt zero, a5, .avg2w4_loop_2 endfunc_x264 /* * void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w8_lasx .avg2w8_loop_2: addi.d a5, a5, -2 fld.d f0, a2, 0 fld.d f1, a4, 0 fldx.d f2, a2, a3 fldx.d f3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 vavgr.bu vr0, vr0, vr1 vavgr.bu vr1, vr2, vr3 fst.d f0, a0, 0 fstx.d f1, a0, a1 alsl.d a0, a1, a0, 1 blt zero, a5, .avg2w8_loop_2 endfunc_x264 /* * void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w16_lasx .avg2w16_loop_2: addi.d a5, a5, -2 vld vr0, a2, 0 vldx vr1, a2, a3 vld vr2, a4, 0 vldx vr3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 vavgr.bu vr0, vr0, vr2 vavgr.bu vr1, vr1, vr3 vst vr0, a0, 0 vstx vr1, a0, a1 alsl.d a0, a1, a0, 1 blt zero, a5, .avg2w16_loop_2 endfunc_x264 /* * void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w20_lasx .avg2w20_loop_2: addi.d a5, a5, -2 xvld xr0, a2, 0 xvldx xr1, a2, a3 xvld xr2, a4, 0 xvldx xr3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 xvavgr.bu xr0, xr0, xr2 xvavgr.bu xr1, xr1, xr3 vst vr0, a0, 0 xvstelm.w xr0, a0, 16, 4 add.d a0, a0, a1 vst vr1, a0, 0 xvstelm.w xr1, a0, 16, 4 add.d a0, a0, a1 blt zero, a5, .avg2w20_loop_2 endfunc_x264 /* * void mc_copy_width16( uint8_t *p_dst, int32_t i_dst_stride, * uint8_t *p_src, int32_t i_src_stride, * int32_t i_height ) */ function_x264 mc_copy_w16_lasx slli.d t0, a3, 1 add.d t1, t0, a3 slli.d t2, a1, 1 add.d t3, t2, a1 .LOOP_COPYW16: vld vr1, a2, 0 vldx vr2, a2, a3 vldx vr3, a2, t0 vldx vr4, a2, t1 vst vr1, a0, 0 vstx vr2, a0, a1 vstx vr3, a0, t2 vstx vr4, a0, t3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 addi.w a4, a4, -4 blt zero, a4, .LOOP_COPYW16 endfunc_x264 /* * void mc_copy_w8( uint8_t *p_dst, intptr_t i_dst_stride, * uint8_t *p_src, intptr_t i_src_stride, * int32_t i_height ) */ function_x264 mc_copy_w8_lasx slli.d t0, a3, 1 add.d t1, t0, a3 slli.d t2, a1, 1 add.d t3, t2, a1 .LOOP_COPYW8: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t0 fldx.d f3, a2, t1 fst.d f0, a0, 0 fstx.d f1, a0, a1 fstx.d f2, a0, t2 fstx.d f3, a0, t3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 addi.w a4, a4, -4 blt zero, a4, .LOOP_COPYW8 endfunc_x264 /* * void mc_copy_w4( uint8_t *p_dst, intptr_t i_dst_stride, * uint8_t *p_src, intptr_t i_src_stride, * int32_t i_height ) */ function_x264 mc_copy_w4_lasx slli.d t0, a3, 1 add.d 
t1, t0, a3 slli.d t2, a1, 1 add.d t3, t2, a1 .LOOP_COPYW4: fld.s f0, a2, 0 fldx.s f1, a2, a3 fldx.s f2, a2, t0 fldx.s f3, a2, t1 fst.s f0, a0, 0 fstx.s f1, a0, a1 fstx.s f2, a0, t2 fstx.s f3, a0, t3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 addi.w a4, a4, -4 blt zero, a4, .LOOP_COPYW4 endfunc_x264 /* * void memzero_aligned( void *p_dst, size_t n ) */ function_x264 memzero_aligned_lasx xvxor.v xr1, xr1, xr1 .memzero_loop: addi.d a1, a1, -128 .rept 4 xvst xr1, a0, 0 addi.d a0, a0, 32 .endr blt zero, a1, .memzero_loop endfunc_x264 /* * void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, * pixel *dstv, pixel *dstc, intptr_t src_stride, * intptr_t dst_stride, int width, int height ) */ function_x264 frame_init_lowres_core_lasx andi t1, a7, 15 sub.w t0, a7, t1 slli.d t2, a5, 1 ldptr.w a7, sp, 0 // use a7 as height variable .height_loop: add.d t4, zero, t0 addi.d t3, a0, 0 addi.d t5, a1, 0 addi.d t6, a2, 0 addi.d t7, a3, 0 addi.d t8, a4, 0 .width16_loop: xvld xr0, t3, 0 xvldx xr1, t3, a5 xvldx xr2, t3, t2 xvavgr.bu xr3, xr0, xr1 xvavgr.bu xr4, xr1, xr2 xvhaddw.hu.bu xr5, xr3, xr3 xvhaddw.hu.bu xr6, xr4, xr4 xvssrarni.bu.h xr6, xr5, 1 xvpermi.d xr7, xr6, 0xd8 vst vr7, t5, 0 xvpermi.q xr7, xr7, 0x11 vst vr7, t7, 0 addi.d t3, t3, 1 xvld xr0, t3, 0 xvldx xr1, t3, a5 xvldx xr2, t3, t2 xvavgr.bu xr3, xr0, xr1 xvavgr.bu xr4, xr1, xr2 xvhaddw.hu.bu xr5, xr3, xr3 xvhaddw.hu.bu xr6, xr4, xr4 xvssrarni.bu.h xr6, xr5, 1 xvpermi.d xr7, xr6, 0xd8 vst vr7, t6, 0 xvpermi.q xr7, xr7, 0x11 vst vr7, t8, 0 addi.d t3, t3, 31 addi.d t5, t5, 16 addi.d t6, t6, 16 addi.d t7, t7, 16 addi.d t8, t8, 16 addi.w t4, t4, -16 blt zero, t4, .width16_loop beqz t1, .width16_end vld vr0, t3, 0 vldx vr1, t3, a5 vldx vr2, t3, t2 vavgr.bu vr3, vr0, vr1 vavgr.bu vr4, vr1, vr2 vhaddw.hu.bu vr5, vr3, vr3 vhaddw.hu.bu vr6, vr4, vr4 vssrarni.bu.h vr6, vr5, 1 fst.d f6, t5, 0 vstelm.d vr6, t7, 0, 1 addi.d t3, t3, 1 vld vr0, t3, 0 vldx vr1, t3, a5 vldx vr2, t3, t2 vavgr.bu vr3, vr0, vr1 vavgr.bu vr4, vr1, vr2 vhaddw.hu.bu vr5, vr3, vr3 vhaddw.hu.bu vr6, vr4, vr4 vssrarni.bu.h vr6, vr5, 1 fst.d f6, t6, 0 vstelm.d vr6, t8, 0, 1 .width16_end: add.d a0, a0, t2 add.d a1, a1, a6 add.d a2, a2, a6 add.d a3, a3, a6 add.d a4, a4, a6 addi.w a7, a7, -1 blt zero, a7, .height_loop endfunc_x264 /* * void mc_chroma(uint8_t *p_dst_u, uint8_t *p_dst_v, * intptr_t i_dst_stride, * uint8_t *p_src, intptr_t i_src_stride, * int32_t m_vx, int32_t m_vy, * int32_t i_width, int32_t i_height) */ function_x264 mc_chroma_lsx MC_CHROMA_START andi a5, a5, 0x07 /* m_vx & 0x07 */ andi a6, a6, 0x07 /* m_vy & 0x07 */ li.d t8, 8 sub.d t1, t8, a5 // 8-d8x sub.d t2, t8, a6 // 8-d8y mul.d t3, t1, t2 // CA mul.d t4, a5, t2 // CB mul.d t5, t1, a6 // CC mul.d t6, a5, a6 // CD vreplgr2vr.b vr0, t3 vreplgr2vr.b vr1, t4 vreplgr2vr.b vr2, t5 vreplgr2vr.b vr3, t6 add.d t0, a3, a4 ldptr.w t1, sp, 0 /* i_height */ move t3, t0 addi.d t4, zero, 1 addi.d t5, zero, 3 addi.d t6, zero, 7 bge t6, a7, .ENDLOOP_W8 .LOOP_W8: vld vr4, a3, 0 vld vr5, t0, 0 vld vr6, a3, 2 vld vr7, t0, 2 vmulwev.h.bu vr8, vr4, vr0 vmulwod.h.bu vr9, vr4, vr0 vmulwev.h.bu vr10, vr5, vr2 vmulwod.h.bu vr11, vr5, vr2 vmaddwev.h.bu vr8, vr6, vr1 vmaddwod.h.bu vr9, vr6, vr1 vmaddwev.h.bu vr10, vr7, vr3 vmaddwod.h.bu vr11, vr7, vr3 vadd.h vr12, vr8, vr10 vadd.h vr13, vr9, vr11 vssrarni.bu.h vr13, vr12, 6 vstelm.d vr13, a0, 0, 0 vstelm.d vr13, a1, 0, 1 add.d a0, a0, a2 add.d a1, a1, a2 addi.d t1, t1, -1 move a3, t3 add.d t3, t3, a4 move t0, t3 blt zero, t1, .LOOP_W8 b .ENDLOOP_W2 .ENDLOOP_W8: bge t5, a7, .ENDLOOP_W4 
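/* Scalar reference for the mc_chroma kernels (LASX above, LSX around this
 * point): 1/8-pel bilinear interpolation on interleaved U/V source data.
 * MC_CHROMA_START advances src by (mvy>>3)*stride + (mvx>>3)*2, the four
 * weights cA..cD are built from the fractional MV parts, and each output
 * pixel is a rounded 6-bit blend of four neighbours.  A sketch under that
 * reading of the asm, not the C fallback x264 ships.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   static void mc_chroma_ref( uint8_t *dst_u, uint8_t *dst_v, intptr_t i_dst,
 *                              const uint8_t *src, intptr_t i_src,
 *                              int mvx, int mvy, int width, int height )
 *   {
 *       src += ( mvy >> 3 ) * i_src + ( mvx >> 3 ) * 2;
 *       int dx = mvx & 7, dy = mvy & 7;
 *       int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
 *       for( int y = 0; y < height; y++, dst_u += i_dst, dst_v += i_dst, src += i_src )
 *           for( int x = 0; x < width; x++ )
 *           {
 *               const uint8_t *s0 = src + 2*x, *s1 = s0 + i_src;
 *               dst_u[x] = ( cA*s0[0] + cB*s0[2] + cC*s1[0] + cD*s1[2] + 32 ) >> 6;
 *               dst_v[x] = ( cA*s0[1] + cB*s0[3] + cC*s1[1] + cD*s1[3] + 32 ) >> 6;
 *           }
 *   }
 */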
.LOOP_W4: vld vr4, a3, 0 vld vr5, t0, 0 vld vr6, a3, 2 vld vr7, t0, 2 vmulwev.h.bu vr8, vr4, vr0 vmulwod.h.bu vr9, vr4, vr0 vmulwev.h.bu vr10, vr5, vr2 vmulwod.h.bu vr11, vr5, vr2 vmaddwev.h.bu vr8, vr6, vr1 vmaddwod.h.bu vr9, vr6, vr1 vmaddwev.h.bu vr10, vr7, vr3 vmaddwod.h.bu vr11, vr7, vr3 vadd.h vr12, vr8, vr10 vadd.h vr13, vr9, vr11 vssrarni.bu.h vr13, vr12, 6 vstelm.w vr13, a0, 0, 0 vstelm.w vr13, a1, 0, 2 add.d a0, a0, a2 add.d a1, a1, a2 move a3, t3 add.d t3, t3, a4 move t0, t3 addi.d t1, t1, -1 blt zero, t1, .LOOP_W4 b .ENDLOOP_W2 .ENDLOOP_W4: bge t4, a7, .ENDLOOP_W2 .LOOP_W2: vld vr4, a3, 0 vld vr5, t0, 0 vld vr6, a3, 2 vld vr7, t0, 2 vmulwev.h.bu vr8, vr4, vr0 vmulwod.h.bu vr9, vr4, vr0 vmulwev.h.bu vr10, vr5, vr2 vmulwod.h.bu vr11, vr5, vr2 vmaddwev.h.bu vr8, vr6, vr1 vmaddwod.h.bu vr9, vr6, vr1 vmaddwev.h.bu vr10, vr7, vr3 vmaddwod.h.bu vr11, vr7, vr3 vadd.h vr12, vr8, vr10 vadd.h vr13, vr9, vr11 vssrarni.bu.h vr13, vr12, 6 vstelm.h vr13, a0, 0, 0 vstelm.h vr13, a1, 0, 4 add.d a0, a0, a2 add.d a1, a1, a2 move a3, t3 add.d t3, t3, a4 move t0, t3 addi.d t1, t1, -1 blt zero, t1, .LOOP_W2 .ENDLOOP_W2: endfunc_x264 function_x264 pixel_avg_weight_w4_lsx addi.d t0, zero, 64 sub.d t0, t0, a6 vreplgr2vr.b vr0, a6 vreplgr2vr.b vr1, t0 vpackev.b vr8, vr1, vr0 .LOOP_AVG_WEIGHT_W4: fld.s f0, a2, 0 fldx.s f1, a2, a3 fld.s f2, a4, 0 fldx.s f3, a4, a5 vilvl.w vr0, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.b vr0, vr2, vr0 vmulwev.h.bu.b vr1, vr0, vr8 vmaddwod.h.bu.b vr1, vr0, vr8 vssrarni.bu.h vr1, vr1, 6 fst.s f1, a0, 0 add.d a0, a0, a1 vstelm.w vr1, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a4, a5, a4, 1 addi.w a7, a7, -2 bnez a7, .LOOP_AVG_WEIGHT_W4 endfunc_x264 function_x264 pixel_avg_w4_lsx .LOOP_AVG_W4: fld.s f0, a2, 0 fldx.s f1, a2, a3 fld.s f4, a4, 0 fldx.s f5, a4, a5 vilvl.w vr0, vr1, vr0 vilvl.w vr4, vr5, vr4 vavgr.bu vr0, vr0, vr4 fst.s f0, a0, 0 add.d a0, a0, a1 vstelm.w vr0, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a4, a5, a4, 1 addi.w a7, a7, -2 bnez a7, .LOOP_AVG_W4 endfunc_x264 function_x264 pixel_avg_weight_w8_lsx addi.d t0, zero, 64 sub.d t0, t0, a6 slli.d t5, a1, 1 add.d t6, a1, t5 add.d t7, a1, t6 vreplgr2vr.b vr0, a6 vreplgr2vr.b vr1, t0 vpackev.b vr8, vr1, vr0 PIXEL_AVG_START_W8 .LOOP_AVG_HEIGHT_W8: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t0 fldx.d f3, a2, t1 fld.d f4, a4, 0 fldx.d f5, a4, a5 fldx.d f6, a4, t3 fldx.d f7, a4, t4 vilvl.b vr0, vr4, vr0 vilvl.b vr1, vr5, vr1 vilvl.b vr2, vr6, vr2 vilvl.b vr3, vr7, vr3 vmulwev.h.bu.b vr4, vr0, vr8 vmulwev.h.bu.b vr5, vr1, vr8 vmulwev.h.bu.b vr6, vr2, vr8 vmulwev.h.bu.b vr7, vr3, vr8 vmaddwod.h.bu.b vr4, vr0, vr8 vmaddwod.h.bu.b vr5, vr1, vr8 vmaddwod.h.bu.b vr6, vr2, vr8 vmaddwod.h.bu.b vr7, vr3, vr8 vssrarni.bu.h vr4, vr4, 6 vssrarni.bu.h vr5, vr5, 6 vssrarni.bu.h vr6, vr6, 6 vssrarni.bu.h vr7, vr7, 6 fst.d f4, a0, 0 fstx.d f5, a0, a1 fstx.d f6, a0, t5 fstx.d f7, a0, t6 add.d a0, a0, t7 alsl.d a2, a3, a2, 2 alsl.d a4, a5, a4, 2 addi.w a7, a7, -4 bnez a7, .LOOP_AVG_HEIGHT_W8 endfunc_x264 function_x264 pixel_avg_w8_lsx PIXEL_AVG_START_W8 .LOOP_AVG_W8: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t0 fldx.d f3, a2, t1 fld.d f4, a4, 0 fldx.d f5, a4, a5 fldx.d f6, a4, t3 fldx.d f7, a4, t4 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr6, vr7, vr6 vavgr.bu vr0, vr0, vr4 vavgr.bu vr2, vr2, vr6 fst.d f0, a0, 0 add.d a0, a0, a1 vstelm.d vr0, a0, 0, 1 fstx.d f2, a0, a1 alsl.d a0, a1, a0, 1 vstelm.d vr2, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 2 alsl.d a4, a5, a4, 2 
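/* Scalar reference for the pixel_avg_w* / pixel_avg_weight_w* pairs in this
 * file.  The PIXEL_AVG/PIXEL_AVG_LSX wrappers compare the weight against 32
 * and only take the weighted path when it differs, because with w == 32 the
 * weighted blend collapses to the rounded average:
 * (a*32 + b*32 + 32) >> 6 == (a + b + 1) >> 1.  Sketch for illustration only;
 * clip8() is an assumed helper, not part of this file.
 *
 *   dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;                        // pixel_avg_w*
 *   dst[x] = clip8( ( src1[x]*w + src2[x]*(64 - w) + 32 ) >> 6 );   // pixel_avg_weight_w*
 *
 * The second form matches the vector code: w and 64-w are packed into one
 * register, the two sources are interleaved byte-wise, and a widening dot
 * product plus vssrarni.bu.h by 6 produces the rounded, saturated result.
 */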
addi.w a7, a7, -4 bnez a7, .LOOP_AVG_W8 endfunc_x264 function_x264 pixel_avg_weight_w16_lsx addi.d t0, zero, 64 sub.d t0, t0, a6 vreplgr2vr.b vr8, a6 vreplgr2vr.b vr9, t0 PIXEL_AVG_START .LOOP_AVG_HEIGHT_W16: LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3 LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7 vmulwev.h.bu.b vr10, vr0, vr8 vmulwev.h.bu.b vr11, vr1, vr8 vmulwev.h.bu.b vr12, vr2, vr8 vmulwev.h.bu.b vr13, vr3, vr8 vmulwod.h.bu.b vr14, vr0, vr8 vmulwod.h.bu.b vr15, vr1, vr8 vmulwod.h.bu.b vr16, vr2, vr8 vmulwod.h.bu.b vr17, vr3, vr8 vmaddwev.h.bu.b vr10, vr4, vr9 vmaddwev.h.bu.b vr11, vr5, vr9 vmaddwev.h.bu.b vr12, vr6, vr9 vmaddwev.h.bu.b vr13, vr7, vr9 vmaddwod.h.bu.b vr14, vr4, vr9 vmaddwod.h.bu.b vr15, vr5, vr9 vmaddwod.h.bu.b vr16, vr6, vr9 vmaddwod.h.bu.b vr17, vr7, vr9 vssrarni.bu.h vr11, vr10, 6 vssrarni.bu.h vr13, vr12, 6 vssrarni.bu.h vr15, vr14, 6 vssrarni.bu.h vr17, vr16, 6 vilvl.b vr10, vr15, vr11 vilvh.b vr11, vr15, vr11 vilvl.b vr12, vr17, vr13 vilvh.b vr13, vr17, vr13 vst vr10, a0, 0 vstx vr11, a0, a1 vstx vr12, a0, t6 vstx vr13, a0, t7 add.d a2, a2, t2 add.d a4, a4, t5 add.d a0, a0, t8 addi.d a7, a7, -4 bnez a7, .LOOP_AVG_HEIGHT_W16 endfunc_x264 function_x264 pixel_avg_w16_lsx PIXEL_AVG_START .LOOP_AVG_W16: vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t0 vldx vr3, a2, t1 vld vr4, a4, 0 vldx vr5, a4, a5 vldx vr6, a4, t3 vldx vr7, a4, t4 vavgr.bu vr0, vr0, vr4 vavgr.bu vr1, vr1, vr5 vavgr.bu vr2, vr2, vr6 vavgr.bu vr3, vr3, vr7 vst vr0, a0, 0 vstx vr1, a0, a1 vstx vr2, a0, t6 vstx vr3, a0, t7 add.d a0, a0, t8 add.d a2, a2, t2 add.d a4, a4, t5 vld vr0, a2, 0 vldx vr1, a2, a3 vldx vr2, a2, t0 vldx vr3, a2, t1 vld vr4, a4, 0 vldx vr5, a4, a5 vldx vr6, a4, t3 vldx vr7, a4, t4 vavgr.bu vr0, vr0, vr4 vavgr.bu vr1, vr1, vr5 vavgr.bu vr2, vr2, vr6 vavgr.bu vr3, vr3, vr7 vst vr0, a0, 0 vstx vr1, a0, a1 vstx vr2, a0, t6 vstx vr3, a0, t7 add.d a2, a2, t2 add.d a4, a4, t5 add.d a0, a0, t8 addi.d a7, a7, -8 bnez a7, .LOOP_AVG_W16 endfunc_x264 /* * void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, * pixel *src2, intptr_t src2_stride, int weight); */ .macro PIXEL_AVG_LSX w, h function_x264 pixel_avg_\w\()x\h\()_lsx addi.d t0, a6, -32 addi.d a7, zero, \h bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lsx b x264_8_pixel_avg_w\w\()_lsx endfunc_x264 .endm PIXEL_AVG_LSX 16, 16 PIXEL_AVG_LSX 16, 8 PIXEL_AVG_LSX 8, 16 PIXEL_AVG_LSX 8, 8 PIXEL_AVG_LSX 8, 4 PIXEL_AVG_LSX 4, 16 PIXEL_AVG_LSX 4, 8 PIXEL_AVG_LSX 4, 4 PIXEL_AVG_LSX 4, 2 function_x264 mc_weight_w20_noden_lsx vldrepl.b vr0, a4, 36 // scale vldrepl.h vr1, a4, 40 // offset .LOOP_WEIGHT_W20_NODEN: vld vr3, a2, 0 vld vr4, a2, 16 add.d a2, a2, a3 vld vr5, a2, 0 vld vr6, a2, 16 vilvl.w vr4, vr6, vr4 vmulwev.h.bu.b vr7, vr3, vr0 vmulwod.h.bu.b vr8, vr3, vr0 vmulwev.h.bu.b vr9, vr4, vr0 vmulwod.h.bu.b vr10, vr4, vr0 vmulwev.h.bu.b vr11, vr5, vr0 vmulwod.h.bu.b vr12, vr5, vr0 vadd.h vr7, vr7, vr1 vadd.h vr8, vr8, vr1 vadd.h vr9, vr9, vr1 vadd.h vr10, vr10, vr1 vadd.h vr11, vr11, vr1 vadd.h vr12, vr12, vr1 vssrani.bu.h vr11, vr7, 0 vssrani.bu.h vr12, vr8, 0 vssrani.bu.h vr9, vr9, 0 vssrani.bu.h vr10, vr10, 0 vilvl.b vr7, vr12, vr11 vilvl.b vr9, vr10, vr9 vilvh.b vr11, vr12, vr11 vst vr7, a0, 0 vstelm.w vr9, a0, 16, 0 add.d a0, a0, a1 vst vr11, a0, 0 vstelm.w vr9, a0, 16, 1 add.d a0, a0, a1 add.d a2, a2, a3 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W20_NODEN endfunc_x264 function_x264 mc_weight_w16_noden_lsx vldrepl.b vr0, a4, 36 // scale vldrepl.h vr1, a4, 40 // offset .LOOP_WEIGHT_W16_NODEN: vld vr3, a2, 
0 vldx vr4, a2, a3 vmulwev.h.bu.b vr5, vr3, vr0 vmulwod.h.bu.b vr6, vr3, vr0 vmulwev.h.bu.b vr7, vr4, vr0 vmulwod.h.bu.b vr8, vr4, vr0 vadd.h vr5, vr5, vr1 vadd.h vr6, vr6, vr1 vadd.h vr7, vr7, vr1 vadd.h vr8, vr8, vr1 vssrani.bu.h vr7, vr5, 0 vssrani.bu.h vr8, vr6, 0 vilvl.b vr5, vr8, vr7 vilvh.b vr7, vr8, vr7 vst vr5, a0, 0 vstx vr7, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W16_NODEN endfunc_x264 function_x264 mc_weight_w8_noden_lsx vldrepl.b vr0, a4, 36 // scale vldrepl.h vr1, a4, 40 // offset .LOOP_WEIGHT_W8_NODEN: fld.d f3, a2, 0 fldx.d f4, a2, a3 vilvl.d vr3, vr4, vr3 vmulwev.h.bu.b vr5, vr3, vr0 vmulwod.h.bu.b vr6, vr3, vr0 vadd.h vr5, vr5, vr1 vadd.h vr6, vr6, vr1 vssrani.bu.h vr5, vr5, 0 vssrani.bu.h vr6, vr6, 0 vilvl.b vr7, vr6, vr5 vstelm.d vr7, a0, 0, 0 add.d a0, a0, a1 vstelm.d vr7, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W8_NODEN endfunc_x264 function_x264 mc_weight_w4_noden_lsx vldrepl.h vr0, a4, 36 // scale vldrepl.h vr1, a4, 40 // offset .LOOP_WEIGHT_W4_NODEN: fld.s f3, a2, 0 fldx.s f4, a2, a3 vilvl.w vr3, vr4, vr3 vsllwil.hu.bu vr3, vr3, 0 vmul.h vr3, vr3, vr0 vadd.h vr3, vr3, vr1 vssrani.bu.h vr3, vr3, 0 vstelm.w vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W4_NODEN endfunc_x264 function_x264 mc_weight_w20_lsx vldrepl.h vr1, a4, 40 // offset vldrepl.b vr0, a4, 36 // scale vldrepl.h vr2, a4, 32 // denom vsll.h vr1, vr1, vr2 .LOOP_WEIGHT_W20: vld vr3, a2, 0 vld vr4, a2, 16 add.d a2, a2, a3 vld vr5, a2, 0 vld vr6, a2, 16 vilvl.w vr4, vr6, vr4 vmulwev.h.bu.b vr7, vr3, vr0 vmulwod.h.bu.b vr8, vr3, vr0 vmulwev.h.bu.b vr9, vr4, vr0 vmulwod.h.bu.b vr10, vr4, vr0 vmulwev.h.bu.b vr11, vr5, vr0 vmulwod.h.bu.b vr12, vr5, vr0 vsadd.h vr7, vr7, vr1 vsadd.h vr8, vr8, vr1 vsadd.h vr9, vr9, vr1 vsadd.h vr10, vr10, vr1 vsadd.h vr11, vr11, vr1 vsadd.h vr12, vr12, vr1 vssrarn.bu.h vr7, vr7, vr2 vssrarn.bu.h vr8, vr8, vr2 vssrarn.bu.h vr9, vr9, vr2 vssrarn.bu.h vr10, vr10, vr2 vssrarn.bu.h vr11, vr11, vr2 vssrarn.bu.h vr12, vr12, vr2 vilvl.b vr7, vr8, vr7 vilvl.b vr9, vr10, vr9 vilvl.b vr11, vr12, vr11 vst vr7, a0, 0 vstelm.w vr9, a0, 16, 0 add.d a0, a0, a1 vst vr11, a0, 0 vstelm.w vr9, a0, 16, 1 add.d a0, a0, a1 add.d a2, a2, a3 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W20 endfunc_x264 function_x264 mc_weight_w16_lsx vldrepl.h vr1, a4, 40 // offset vldrepl.b vr0, a4, 36 // scale vldrepl.h vr2, a4, 32 // denom vsll.h vr1, vr1, vr2 .LOOP_WEIGHT_W16: vld vr3, a2, 0 vldx vr4, a2, a3 vmulwev.h.bu.b vr5, vr3, vr0 vmulwod.h.bu.b vr6, vr3, vr0 vmulwev.h.bu.b vr7, vr4, vr0 vmulwod.h.bu.b vr8, vr4, vr0 vsadd.h vr5, vr5, vr1 vsadd.h vr6, vr6, vr1 vsadd.h vr7, vr7, vr1 vsadd.h vr8, vr8, vr1 vssrarn.bu.h vr5, vr5, vr2 vssrarn.bu.h vr6, vr6, vr2 vssrarn.bu.h vr7, vr7, vr2 vssrarn.bu.h vr8, vr8, vr2 vilvl.b vr5, vr6, vr5 vilvl.b vr7, vr8, vr7 vst vr5, a0, 0 vstx vr7, a0, a1 alsl.d a2, a3, a2, 1 alsl.d a0, a1, a0, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W16 endfunc_x264 function_x264 mc_weight_w8_lsx vldrepl.h vr1, a4, 40 // offset vldrepl.b vr0, a4, 36 // scale vldrepl.h vr2, a4, 32 // denom vsll.h vr1, vr1, vr2 .LOOP_WEIGHT_W8: fld.d f3, a2, 0 fldx.d f4, a2, a3 vilvl.d vr3, vr4, vr3 vmulwev.h.bu.b vr5, vr3, vr0 vmulwod.h.bu.b vr6, vr3, vr0 vsadd.h vr5, vr5, vr1 vsadd.h vr6, vr6, vr1 vssrarn.bu.h vr5, vr5, vr2 vssrarn.bu.h vr6, vr6, vr2 vilvl.b vr7, vr6, vr5 vstelm.d vr7, a0, 0, 0 add.d a0, a0, a1 
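/* Note on the vsll.h vr1, vr1, vr2 step used by the denom variants here and
 * in the LASX versions above: adding the pre-shifted offset before the single
 * rounded shift is exactly equivalent to shifting first and adding the offset
 * afterwards, because offset << denom is a multiple of 1 << denom:
 *
 *   ( s*scale + (offset << denom) + (1 << (denom-1)) ) >> denom
 *     == ( ( s*scale + (1 << (denom-1)) ) >> denom ) + offset
 *
 * e.g. with s*scale = 100, offset = 3, denom = 2:
 *   (100 + 12 + 2) >> 2 = 114 >> 2 = 28   and   ((100 + 2) >> 2) + 3 = 25 + 3 = 28.
 *
 * This lets the kernels finish with a single rounded, saturating narrowing
 * shift (vssrarn.bu.h) instead of a shift plus a separate add and clip.
 * (Worked example added for illustration; the equivalence holds as long as
 * the saturating add does not clip.)
 */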
vstelm.d vr7, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W8 endfunc_x264 function_x264 mc_weight_w4_lsx vldrepl.h vr1, a4, 40 // offset vldrepl.h vr0, a4, 36 // scale vldrepl.h vr2, a4, 32 // denom vsll.h vr1, vr1, vr2 .LOOP_WEIGHT_W4: fld.s f3, a2, 0 fldx.s f4, a2, a3 vilvl.w vr3, vr4, vr3 vsllwil.hu.bu vr3, vr3, 0 vmul.h vr3, vr3, vr0 vsadd.h vr3, vr3, vr1 vssrarn.bu.h vr3, vr3, vr2 vstelm.w vr3, a0, 0, 0 add.d a0, a0, a1 vstelm.w vr3, a0, 0, 1 add.d a0, a0, a1 alsl.d a2, a3, a2, 1 addi.w a5, a5, -2 blt zero, a5, .LOOP_WEIGHT_W4 endfunc_x264 /* * void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w4_lsx .LOOP_AVG2_W4: addi.d a5, a5, -2 fld.s f0, a2, 0 fld.s f1, a4, 0 fldx.s f2, a2, a3 fldx.s f3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 vavgr.bu vr0, vr0, vr1 vavgr.bu vr1, vr2, vr3 fst.s f0, a0, 0 fstx.s f1, a0, a1 alsl.d a0, a1, a0, 1 blt zero, a5, .LOOP_AVG2_W4 endfunc_x264 /* * void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w8_lsx .LOOP_AVG2_W8: addi.d a5, a5, -2 fld.d f0, a2, 0 fld.d f1, a4, 0 fldx.d f2, a2, a3 fldx.d f3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 vavgr.bu vr0, vr0, vr1 vavgr.bu vr1, vr2, vr3 fst.d f0, a0, 0 fstx.d f1, a0, a1 alsl.d a0, a1, a0, 1 blt zero, a5, .LOOP_AVG2_W8 endfunc_x264 /* * void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w16_lsx .LOOP_AVG2_W16: addi.d a5, a5, -2 vld vr0, a2, 0 vldx vr1, a2, a3 vld vr2, a4, 0 vldx vr3, a4, a3 alsl.d a2, a3, a2, 1 alsl.d a4, a3, a4, 1 vavgr.bu vr0, vr0, vr2 vavgr.bu vr1, vr1, vr3 vst vr0, a0, 0 vstx vr1, a0, a1 alsl.d a0, a1, a0, 1 blt zero, a5, .LOOP_AVG2_W16 endfunc_x264 /* * void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, * intptr_t i_src_stride, uint8_t *src2, int i_height) */ function_x264 pixel_avg2_w20_lsx .LOOP_AVG2_W20: addi.d a5, a5, -2 vld vr0, a2, 0 vld vr1, a2, 16 vld vr2, a4, 0 vld vr3, a4, 16 add.d a2, a2, a3 add.d a4, a4, a3 vld vr4, a2, 0 vld vr5, a2, 16 vld vr6, a4, 0 vld vr7, a4, 16 vavgr.bu vr0, vr0, vr2 vavgr.bu vr1, vr1, vr3 vavgr.bu vr4, vr4, vr6 vavgr.bu vr5, vr5, vr7 vst vr0, a0, 0 vstelm.w vr1, a0, 16, 0 add.d a0, a0, a1 vst vr4, a0, 0 vstelm.w vr5, a0, 16, 0 add.d a2, a2, a3 add.d a4, a4, a3 add.d a0, a0, a1 blt zero, a5, .LOOP_AVG2_W20 endfunc_x264 /* * void mc_copy_width16( uint8_t *p_dst, int32_t i_dst_stride, * uint8_t *p_src, int32_t i_src_stride, * int32_t i_height ) */ function_x264 mc_copy_w16_lsx slli.d t0, a3, 1 add.d t1, t0, a3 slli.d t2, a1, 1 add.d t3, t2, a1 .LOOP_COPY_W16: vld vr1, a2, 0 vldx vr2, a2, a3 vldx vr3, a2, t0 vldx vr4, a2, t1 vst vr1, a0, 0 vstx vr2, a0, a1 vstx vr3, a0, t2 vstx vr4, a0, t3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 addi.w a4, a4, -4 blt zero, a4, .LOOP_COPY_W16 endfunc_x264 /* * void mc_copy_w8(uint8_t *p_dst, intptr_t i_dst_stride, * uint8_t *p_src, intptr_t i_src_stride, * int32_t i_height) */ function_x264 mc_copy_w8_lsx slli.d t0, a3, 1 add.d t1, t0, a3 slli.d t2, a1, 1 add.d t3, t2, a1 .LOOP_COPY_W8: fld.d f0, a2, 0 fldx.d f1, a2, a3 fldx.d f2, a2, t0 fldx.d f3, a2, t1 fst.d f0, a0, 0 fstx.d f1, a0, a1 fstx.d f2, a0, t2 fstx.d f3, a0, t3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 addi.w a4, a4, -4 blt zero, a4, .LOOP_COPY_W8 
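/* mc_copy_w8 copies four 8-byte rows per iteration through the FP registers; the alsl.d shift-and-add steps then advance dst and src by four strides before the next pass. */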
endfunc_x264 /* * void mc_copy_w4(uint8_t *p_dst, intptr_t i_dst_stride, * uint8_t *p_src, intptr_t i_src_stride, * int32_t i_height) */ function_x264 mc_copy_w4_lsx slli.d t0, a3, 1 add.d t1, t0, a3 slli.d t2, a1, 1 add.d t3, t2, a1 .LOOP_COPY_W4: fld.s f0, a2, 0 fldx.s f1, a2, a3 fldx.s f2, a2, t0 fldx.s f3, a2, t1 fst.s f0, a0, 0 fstx.s f1, a0, a1 fstx.s f2, a0, t2 fstx.s f3, a0, t3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 addi.w a4, a4, -4 blt zero, a4, .LOOP_COPY_W4 endfunc_x264 /* * void store_interleave_chroma(uint8_t *p_dst, intptr_t i_dst_stride, * uint8_t *p_src0, uint8_t *p_src1, * int32_t i_height) */ function_x264 store_interleave_chroma_lsx .loop_interleave_chroma: fld.d f0, a2, 0 fld.d f1, a3, 0 addi.d a2, a2, FDEC_STRIDE addi.d a3, a3, FDEC_STRIDE vilvl.b vr0, vr1, vr0 vst vr0, a0, 0 add.d a0, a0, a1 addi.w a4, a4, -1 blt zero, a4, .loop_interleave_chroma endfunc_x264 /* * void load_deinterleave_chroma_fenc(pixel *dst, pixel *src, * intptr_t i_src, int height) */ function_x264 load_deinterleave_chroma_fenc_lsx addi.d t0, a0, FENC_STRIDE/2 andi t1, a3, 1 sub.w t2, a3, t1 .loop_deinterleave_fenc: vld vr0, a1, 0 vldx vr1, a1, a2 vpickev.b vr2, vr1, vr0 vpickod.b vr3, vr1, vr0 fst.d f2, a0, 0 fst.d f3, t0, 0 vstelm.d vr2, a0, FENC_STRIDE, 1 vstelm.d vr3, t0, FENC_STRIDE, 1 addi.d a0, a0, FENC_STRIDE * 2 addi.d t0, t0, FENC_STRIDE * 2 alsl.d a1, a2, a1, 1 addi.w t2, t2, -2 blt zero, t2, .loop_deinterleave_fenc beqz t1, .loop_deinterleave_fenc_end vld vr0, a1, 0 vpickev.b vr1, vr0, vr0 vpickod.b vr2, vr0, vr0 fst.d f1, a0, 0 fst.d f2, t0, 0 .loop_deinterleave_fenc_end: endfunc_x264 /* * void load_deinterleave_chroma_fdec(pixel *dst, pixel *src, * intptr_t i_src, int height) */ function_x264 load_deinterleave_chroma_fdec_lsx addi.d t0, a0, FDEC_STRIDE/2 andi t1, a3, 1 sub.w t2, a3, t1 .loop_deinterleave_fdec: vld vr0, a1, 0 vldx vr1, a1, a2 vpickev.b vr2, vr1, vr0 vpickod.b vr3, vr1, vr0 fst.d f2, a0, 0 fst.d f3, t0, 0 vstelm.d vr2, a0, FDEC_STRIDE, 1 vstelm.d vr3, t0, FDEC_STRIDE, 1 addi.d a0, a0, FDEC_STRIDE * 2 addi.d t0, t0, FDEC_STRIDE * 2 alsl.d a1, a2, a1, 1 addi.w t2, t2, -2 blt zero, t2, .loop_deinterleave_fdec beqz t1, .loop_deinterleave_fdec_end vld vr0, a1, 0 vpickev.b vr1, vr0, vr0 vpickod.b vr2, vr0, vr0 fst.d f1, a0, 0 fst.d f2, t0, 0 .loop_deinterleave_fdec_end: endfunc_x264 /* * x264_plane_copy_interleave(pixel *dst, intptr_t i_dst, * pixel *srcu, intptr_t i_srcu, * pixel *srcv, intptr_t i_srcv, int w, int h) */ function_x264 plane_copy_interleave_core_lsx .loop_h: add.d t0, a0, zero add.d t2, a2, zero add.d t4, a4, zero add.d t6, a6, zero .loop_copy_interleavew16: vld vr0, t2, 0 vld vr1, t4, 0 vilvl.b vr2, vr1, vr0 vilvh.b vr3, vr1, vr0 vst vr2, t0, 0 vst vr3, t0, 16 addi.d t2, t2, 16 addi.d t4, t4, 16 addi.d t0, t0, 32 addi.w t6, t6, -16 blt zero, t6, .loop_copy_interleavew16 add.d a2, a2, a3 add.d a4, a4, a5 add.d a0, a0, a1 addi.w a7, a7, -1 blt zero, a7, .loop_h endfunc_x264 /* * void x264_plane_copy_deinterleave(pixel *dsta, intptr_t i_dsta, * pixel *dstb, intptr_t i_dstb, * pixel *src, intptr_t i_src, int w, int h) */ function_x264 plane_copy_deinterleave_lsx .LOOP_PLANE_COPY_H: add.d t0, a0, zero add.d t2, a2, zero add.d t4, a4, zero add.d t6, a6, zero .LOOP_PLANE_COPY_W16: vld vr0, t4, 0 vld vr1, t4, 16 vpickev.b vr2, vr1, vr0 vpickod.b vr3, vr1, vr0 vst vr2, t0, 0 vst vr3, t2, 0 addi.d t4, t4, 32 addi.d t0, t0, 16 addi.d t2, t2, 16 addi.w t6, t6, -16 blt zero, t6, .LOOP_PLANE_COPY_W16 add.d a2, a2, a3 add.d a4, a4, a5 add.d a0, a0, a1 addi.w a7, a7, -1 
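/* plane_copy_deinterleave (LSX): each inner-loop pass splits 32 interleaved source bytes into 16 even bytes for the first destination plane (vpickev.b) and 16 odd bytes for the second (vpickod.b); the outer loop advances all three pointers by their strides once per row. */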
blt zero, a7, .LOOP_PLANE_COPY_H endfunc_x264 function_x264 plane_copy_deinterleave_lasx .LOOP_PLANE_COPY_H_LASX: add.d t0, a0, zero add.d t2, a2, zero add.d t4, a4, zero add.d t6, a6, zero .LOOP_PLANE_COPY_W32_LASX: xvld xr0, t4, 0 xvld xr1, t4, 32 xvpickev.b xr2, xr1, xr0 xvpickod.b xr3, xr1, xr0 xvpermi.d xr2, xr2, 0xd8 xvpermi.d xr3, xr3, 0xd8 xvst xr2, t0, 0 xvst xr3, t2, 0 addi.d t4, t4, 64 addi.d t0, t0, 32 addi.d t2, t2, 32 addi.w t6, t6, -32 blt zero, t6, .LOOP_PLANE_COPY_W32_LASX add.d a2, a2, a3 add.d a4, a4, a5 add.d a0, a0, a1 addi.w a7, a7, -1 blt zero, a7, .LOOP_PLANE_COPY_H_LASX endfunc_x264 /* * void prefetch_ref(uint8_t *pix, intptr_t stride, int32_t parity) */ function_x264 prefetch_ref_lsx addi.d a2, a2, -1 addi.d a0, a0, 64 and a2, a2, a1 alsl.d t1, a2, a0, 3 alsl.d a2, a1, a1, 1 preld 0, t1, 0 add.d t2, t1, a1 preld 0, t2, 0 add.d t2, t2, a1 preld 0, t2, 0 add.d t1, t1, a2 preld 0, t1, 0 alsl.d a0, a1, t2, 1 preld 0, a0, 0 add.d t1, a0, a1 preld 0, t1, 0 add.d t1, t1, a1 preld 0, t1, 0 add.d a0, a0, a2 preld 0, a0, 0 endfunc_x264 /* * void prefetch_fenc_422(uint8_t *pix_y, intptr_t stride_y, * uint8_t *pix_uv, intptr_t stride_uv, * int32_t mb_x) */ function_x264 prefetch_fenc_422_lsx andi t0, a4, 3 mul.d t0, t0, a1 andi a4, a4, 6 mul.d t1, a4, a3 addi.d a0, a0, 64 addi.d a2, a2, 64 alsl.d a0, t0, a0, 2 preld 0, a0, 0 add.d t2, a0, a1 preld 0, t2, 0 add.d a0, t2, a1 preld 0, a0, 0 add.d a0, a0, a1 preld 0, a0, 0 alsl.d a2, t1, a2, 2 preld 0, a2, 0 add.d t3, a2, a3 preld 0, t3, 0 add.d a2, t3, a3 preld 0, a2, 0 add.d a2, a2, a3 preld 0, a2, 0 endfunc_x264 /* * void prefetch_fenc_420(uint8_t *pix_y, intptr_t stride_y, * uint8_t *pix_uv, intptr_t stride_uv, * int32_t mb_x) */ function_x264 prefetch_fenc_420_lsx andi t0, a4, 3 mul.d t0, t0, a1 andi a4, a4, 6 mul.d t1, a4, a3 addi.d a0, a0, 64 addi.d a2, a2, 64 alsl.d a0, t0, a0, 2 preld 0, a0, 0 add.d t2, a0, a1 preld 0, t2, 0 add.d a0, t2, a1 preld 0, a0, 0 add.d a0, a0, a1 preld 0, a0, 0 alsl.d a2, t1, a2, 2 preld 0, a2, 0 add.d a2, a2, a3 preld 0, a2, 0 endfunc_x264 /* * void *memcpy_aligned(void *dst, const void *src, size_t n) */ function_x264 memcpy_aligned_lsx andi t0, a2, 16 beqz t0, 2f addi.d a2, a2, -16 vld vr0, a1, 0 vst vr0, a0, 0 addi.d a1, a1, 16 addi.d a0, a0, 16 2: andi t0, a2, 32 beqz t0, 3f addi.d a2, a2, -32 vld vr0, a1, 0 vld vr1, a1, 16 vst vr0, a0, 0 vst vr1, a0, 16 addi.d a1, a1, 32 addi.d a0, a0, 32 3: beqz a2, 5f 4: addi.d a2, a2, -64 vld vr0, a1, 48 vld vr1, a1, 32 vld vr2, a1, 16 vld vr3, a1, 0 vst vr0, a0, 48 vst vr1, a0, 32 vst vr2, a0, 16 vst vr3, a0, 0 addi.d a1, a1, 64 addi.d a0, a0, 64 blt zero, a2, 4b 5: endfunc_x264 /* * void memzero_aligned(void *p_dst, size_t n) */ function_x264 memzero_aligned_lsx vxor.v vr1, vr1, vr1 .loop_memzero: addi.d a1, a1, -128 vst vr1, a0, 0 vst vr1, a0, 16 vst vr1, a0, 32 vst vr1, a0, 48 vst vr1, a0, 64 vst vr1, a0, 80 vst vr1, a0, 96 vst vr1, a0, 112 addi.d a0, a0, 128 blt zero, a1, .loop_memzero endfunc_x264 .macro FILT_H_LSX s1, s2, s3 vsub.h \s1, \s1, \s2 vsrai.h \s1, \s1, 2 vsub.h \s1, \s1, \s2 vadd.h \s1, \s1, \s3 vsrai.h \s1, \s1, 2 vadd.h \s1, \s1, \s3 .endm //s1: s1.0, s2: s2.0, s3: s3.0, s4: s1.1 s5: s2.1 s6: s3.1 .macro FILT_C_LSX s1, s2, s3, s4, s5, s6 vaddi.bu vr17, vr23, 2 //vr24 vaddi.bu vr19, vr26, 1 //vr27 vaddi.bu vr18, vr26, 3 //vr29 vshuf.b vr1, \s2, \s4, vr23 vshuf.b vr2, \s2, \s4, vr17 vshuf.b vr3, \s5, \s2, vr18 vshuf.b vr4, \s5, \s2, vr19 vadd.h vr3, vr2, vr3 vshuf.b vr16, \s5, \s2, vr23 vshuf.b vr17, \s5, \s2, vr17 vshuf.b vr18, \s3, 
\s5, vr18 vshuf.b vr19, \s3, \s5, vr19 vadd.h vr18, vr17, vr18 vmov vr2, \s5 vmov \s1, \s3 vmov vr20, \s3 vmov \s4, \s6 vaddi.bu vr17, vr26, 5 //vr30 vshuf.b \s3, vr2, \s2, vr17 vshuf.b \s6, vr20, \s5, vr17 vadd.h vr4, vr4, \s2 vadd.h \s3, \s3, vr1 vadd.h vr19, vr19, \s5 vadd.h \s6, \s6, vr16 FILT_H_LSX \s3, vr3, vr4 FILT_H_LSX \s6, vr18, vr19 .endm .macro FILT_PACK_LSX s1, s2, s3 vmulwev.w.h vr16, \s1, \s3 vmulwev.w.h vr17, \s2, \s3 vsrarni.h.w vr17, vr16, 15 vmaxi.h vr17, vr17, 0 vsat.hu vr17, vr17, 7 vmulwod.w.h vr18, \s1, \s3 vmulwod.w.h vr19, \s2, \s3 vsrarni.h.w vr19, vr18, 15 vmaxi.h vr19, vr19, 0 vsat.hu vr19, vr19, 7 vpackev.b \s1, vr19, vr17 .endm //s1: s1.0, s2: s2.0, s3: s3.0, s4: s4.0 //s5: s1.1, s6: s2.1, s7: s3.1, s8: s4.1 .macro DO_FILT_C_LSX s1, s2, s3, s4, s5, s6, s7, s8 FILT_C_LSX \s1, \s2, \s3, \s5, \s6, \s7 FILT_C_LSX \s2, \s1, \s4, \s6, \s5, \s8 FILT_PACK_LSX \s3, \s4, vr15 FILT_PACK_LSX \s7, \s8, vr15 vilvl.d vr16, \s7, \s3 vilvh.d vr17, \s7, \s3 addi.d t3, a5, 16 vstx vr16, a5, a4 vstx vr17, t3, a4 .endm .macro DO_FILT_H_LSX s1, s2, s3, s4, s5, s6 vaddi.bu vr16, vr23, 2 //vr24 vaddi.bu vr17, vr23, 3 //vr25 vaddi.bu vr18, vr26, 1 //vr27 vaddi.bu vr19, vr26, 2 //vr28 vld vr3, t5, 0 vshuf.b vr1, \s2, \s4, vr16 vshuf.b vr2, \s2, \s4, vr17 vshuf.b vr4, \s5, \s2, vr26 vshuf.b vr5, \s5, \s2, vr18 vshuf.b vr6, \s5, \s2, vr19 vdp2.h.bu.b vr16, vr1, vr12 vdp2.h.bu.b vr17, vr2, vr12 vdp2.h.bu.b vr18, \s2, vr14 vdp2.h.bu.b vr19, vr4, vr14 vdp2.h.bu.b vr20, vr5, vr0 vdp2.h.bu.b vr21, vr6, vr0 vadd.h vr1, vr16, vr18 vadd.h vr2, vr17, vr19 vadd.h vr1, vr1, vr20 vadd.h vr2, vr2, vr21 FILT_PACK_LSX vr1, vr2, vr15 vshuf.b vr1, vr1, vr1, vr3 vstx vr1, a0, a4 vaddi.bu vr16, vr23, 2 //vr24 vaddi.bu vr17, vr23, 3 //vr25 vaddi.bu vr18, vr26, 1 //vr27 vaddi.bu vr19, vr26, 2 //vr28 vshuf.b vr1, \s5, \s2, vr16 vshuf.b vr2, \s5, \s2, vr17 vshuf.b vr4, \s3, \s5, vr26 vshuf.b vr5, \s3, \s5, vr18 vshuf.b vr6, \s3, \s5, vr19 vdp2.h.bu.b vr16, vr1, vr12 vdp2.h.bu.b vr17, vr2, vr12 vdp2.h.bu.b vr18, \s5, vr14 vdp2.h.bu.b vr19, vr4, vr14 vdp2.h.bu.b vr20, vr5, vr0 vdp2.h.bu.b vr21, vr6, vr0 vadd.h vr1, vr16, vr18 vadd.h vr2, vr17, vr19 vadd.h vr1, vr1, vr20 vadd.h vr2, vr2, vr21 FILT_PACK_LSX vr1, vr2, vr15 vshuf.b vr1, vr1, vr1, vr3 addi.d a0, a0, 16 vstx vr1, a0, a4 addi.d a0, a0, -16 vmov \s1, \s2 vmov \s2, \s3 vmov \s4, \s5 vmov \s5, \s6 .endm /* s3: temp, s4: UNUSED, s5: imm */ .macro DO_FILT_V0_LSX s1, s2, s3, s4, s5 alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */ alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */ vld vr1, a3, 0 vldx vr2, a3, a2 vld \s3, t2, 0 vld vr3, a1, 0 vldx \s1, a1, a2 vld \s2, t1, 0 vilvh.b vr16, vr2, vr1 vilvl.b vr17, vr2, vr1 vilvh.b vr18, \s2, \s1 vilvl.b vr19, \s2, \s1 vilvh.b vr20, \s3, vr3 vilvl.b vr21, \s3, vr3 vdp2.h.bu.b vr1, vr17, vr12 vdp2.h.bu.b vr4, vr16, vr12 vdp2.h.bu.b \s1, vr19, vr0 vdp2.h.bu.b vr2, vr18, vr0 vdp2.h.bu.b vr3, vr21, vr14 vdp2.h.bu.b \s2, vr20, vr14 vadd.h vr1, vr1, \s1 vadd.h vr4, vr4, vr2 vadd.h vr1, vr1, vr3 vadd.h vr4, vr4, \s2 vmov \s1, vr1 vmov \s2, vr4 addi.d a3, a3, 16 addi.d a1, a1, 16 FILT_PACK_LSX vr1, vr4, vr15 addi.d t3, a4, \s5 vstx vr1, t0, t3 .endm .macro DO_FILT_V1_LSX s1, s2, s3, s4, s5 vld vr1, a3, 0 vldx vr2, a3, a2 vld \s3, t2, 16 vld vr3, a1, 0 vldx \s1, a1, a2 vld \s2, t1, 16 vilvh.b vr16, vr2, vr1 vilvl.b vr17, vr2, vr1 vilvh.b vr18, \s2, \s1 vilvl.b vr19, \s2, \s1 vilvh.b vr20, \s3, vr3 vilvl.b vr21, \s3, vr3 vdp2.h.bu.b vr1, vr17, vr12 vdp2.h.bu.b vr4, vr16, vr12 vdp2.h.bu.b \s1, vr19, vr0 vdp2.h.bu.b vr2, vr18, vr0 
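/* DO_FILT_V1: the vdp2.h.bu.b pairs accumulate the 6-tap (1,-5,20,20,-5,1) half-pel filter vertically, using the filt_mul15/filt_mul51/filt_mul20 coefficient vectors loaded in hpel_filter_lsx; FILT_PACK_LSX later rounds the 16-bit sums back to pixels. */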
vdp2.h.bu.b vr3, vr21, vr14 vdp2.h.bu.b \s2, vr20, vr14 vadd.h vr1, vr1, \s1 vadd.h vr4, vr4, vr2 vadd.h vr1, vr1, vr3 vadd.h vr4, vr4, \s2 vmov \s1, vr1 vmov \s2, vr4 addi.d a3, a3, 16 addi.d a1, a1, 16 FILT_PACK_LSX vr1, vr4, vr15 addi.d t3, a4, \s5 addi.d t3, t3, 16 vstx vr1, t0, t3 .endm /* * void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, * uint8_t *src, intptr_t stride, int width, int height ) */ function_x264 hpel_filter_lsx addi.d sp, sp, -64 fst.d f24, sp, 0 fst.d f25, sp, 8 fst.d f26, sp, 16 fst.d f27, sp, 24 fst.d f28, sp, 32 fst.d f29, sp, 40 fst.d f30, sp, 48 fst.d f31, sp, 56 move a7, a3 addi.d a5, a5, -32 move t0, a1 andi a7, a7, 31 sub.d a3, a3, a7 add.d a0, a0, a5 add.d t0, t0, a5 add.d a7, a7, a5 add.d a5, a5, a2 move a2, a4 sub.d a7, zero, a7 add.d a1, a3, a2 sub.d a3, a3, a2 sub.d a3, a3, a2 move a4, a7 la.local t1, filt_mul51 vld vr0, t1, 0 la.local t2, filt_mul15 vld vr12, t2, 0 la.local t3, filt_mul20 vld vr14, t3, 0 la.local t4, pw_1024 vld vr15, t4, 0 la.local t5, hpel_shuf la.local t2, shuf_12 vld vr23, t2, 0 la.local t3, shuf_1 vld vr26, t3, 0 vxor.v vr9, vr9, vr9 vxor.v vr10, vr10, vr10 vxor.v vr11, vr11, vr11 vxor.v vr13, vr13, vr13 .LOOPY_LSX: DO_FILT_V0_LSX vr24, vr25, vr31, vr12, 0 DO_FILT_V1_LSX vr8, vr7, vr22, vr12, 0 .LOOPX_LSX: DO_FILT_V0_LSX vr27, vr28, vr29, vr12, 32 DO_FILT_V1_LSX vr6, vr5, vr30, vr12, 32 .LSTX: vsrli.h vr15, vr15, 1 DO_FILT_C_LSX vr9, vr24, vr8, vr27, vr10, vr25, vr7, vr28 vadd.h vr15, vr15, vr15 vmov vr8, vr6 vmov vr7, vr5 DO_FILT_H_LSX vr11, vr31, vr29, vr13, vr22, vr30 addi.d a4, a4, 32 blt a4, zero, .LOOPX_LSX addi.d t1, a4, -32 blt t1, zero, .LSTX //setup regs for next y sub.d a4, a4, a7 sub.d a4, a4, a2 sub.d a1, a1, a4 sub.d a3, a3, a4 add.d a0, a0, a2 add.d t0, t0, a2 add.d a5, a5, a2 move a4, a7 addi.d a6, a6, -1 blt zero, a6, .LOOPY_LSX fld.d f24, sp, 0 fld.d f25, sp, 8 fld.d f26, sp, 16 fld.d f27, sp, 24 fld.d f28, sp, 32 fld.d f29, sp, 40 fld.d f30, sp, 48 fld.d f31, sp, 56 addi.d sp, sp, 64 endfunc_x264 /* * void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, * pixel *dstv, pixel *dstc, intptr_t src_stride, * intptr_t dst_stride, int width, int height) */ function_x264 frame_init_lowres_core_lsx addi.d t0, zero, 15 addi.d t1, zero, 7 addi.d t2, zero, 3 addi.d t3, zero, 1 ld.d t4, sp, 0 addi.d sp, sp, -16 st.d s0, sp, 0 st.d s1, sp, 8 slli.d s0, a5, 1 .LOOPH: bge zero, t4, .ENDLOOPH addi.d t4, t4, -1 add.d t5, a0, a5 add.d t7, t5, a5 move t6, a7 .LOOPW16: bge t0, t6, .LOOPW8 vld vr0, a0, 0 vld vr1, t5, 0 vld vr2, t7, 0 vld vr3, a0, 1 vld vr4, t5, 1 vld vr5, t7, 1 vld vr6, a0, 16 vld vr7, t5, 16 vld vr8, t7, 16 vld vr9, a0, 17 vld vr10, t5, 17 vld vr11, t7, 17 // Calculate dst0, dsth, dstv and dstc vavgr.bu vr12, vr0, vr1 vavgr.bu vr13, vr1, vr2 vavgr.bu vr14, vr3, vr4 vavgr.bu vr15, vr4, vr5 vavgr.bu vr16, vr6, vr7 vavgr.bu vr17, vr7, vr8 vavgr.bu vr18, vr9, vr10 vavgr.bu vr19, vr10, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vssrarni.bu.h vr13, vr12, 1 vssrarni.bu.h vr15, vr14, 1 vssrarni.bu.h vr17, vr16, 1 vssrarni.bu.h vr19, vr18, 1 vilvl.d vr12, vr17, vr13 vilvl.d vr14, vr19, vr15 vilvh.d vr13, vr17, vr13 vilvh.d vr15, vr19, vr15 vst vr12, a1, 0 vst vr14, a2, 0 vst vr13, a3, 0 vst vr15, a4, 0 addi.d a1, a1, 16 addi.d a2, a2, 16 addi.d a3, a3, 16 addi.d a4, a4, 16 addi.d a0, a0, 32 addi.d 
t5, t5, 32 addi.d t7, t7, 32 addi.d t6, t6, -16 b .LOOPW16 .LOOPW8: bge t1, t6, .LOOPW4 vld vr0, a0, 0 vld vr1, t5, 0 vld vr2, t7, 0 vld vr3, a0, 1 vld vr4, t5, 1 vld vr5, t7, 1 // Calculate dst0, dsth, dstv and dstc vavgr.bu vr12, vr0, vr1 vavgr.bu vr13, vr1, vr2 vavgr.bu vr14, vr3, vr4 vavgr.bu vr15, vr4, vr5 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vssrarni.bu.h vr13, vr12, 1 vssrarni.bu.h vr15, vr14, 1 vstelm.d vr13, a1, 0, 0 vstelm.d vr15, a2, 0, 0 vstelm.d vr13, a3, 0, 1 vstelm.d vr15, a4, 0, 1 addi.d a1, a1, 8 addi.d a2, a2, 8 addi.d a3, a3, 8 addi.d a4, a4, 8 addi.d a0, a0, 16 addi.d t5, t5, 16 addi.d t7, t7, 16 addi.d t6, t6, -8 b .LOOPW8 .LOOPW4: bge t2, t6, .LOOPW2 vld vr0, a0, 0 vld vr1, t5, 0 vld vr2, t7, 0 vld vr3, a0, 1 vld vr4, t5, 1 vld vr5, t7, 1 // Calculate dst0, dsth, dstv and dstc vavgr.bu vr12, vr0, vr1 vavgr.bu vr13, vr1, vr2 vavgr.bu vr14, vr3, vr4 vavgr.bu vr15, vr4, vr5 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vssrarni.bu.h vr13, vr12, 1 vssrarni.bu.h vr15, vr14, 1 vstelm.w vr13, a1, 0, 0 vstelm.w vr15, a2, 0, 0 vstelm.w vr13, a3, 0, 2 vstelm.w vr15, a4, 0, 2 addi.d a1, a1, 4 addi.d a2, a2, 4 addi.d a3, a3, 4 addi.d a4, a4, 4 addi.d a0, a0, 8 addi.d t5, t5, 8 addi.d t7, t7, 8 addi.d t6, t6, -4 b .LOOPW4 .LOOPW2: bge t3, t6, .LOOPW1 vld vr0, a0, 0 vld vr1, t5, 0 vld vr2, t7, 0 vld vr3, a0, 1 vld vr4, t5, 1 vld vr5, t7, 1 // Calculate dst0, dsth, dstv and dstc vavgr.bu vr12, vr0, vr1 vavgr.bu vr13, vr1, vr2 vavgr.bu vr14, vr3, vr4 vavgr.bu vr15, vr4, vr5 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vssrarni.bu.h vr13, vr12, 1 vssrarni.bu.h vr15, vr14, 1 vstelm.h vr13, a1, 0, 0 vstelm.h vr15, a2, 0, 0 vstelm.h vr13, a3, 0, 4 vstelm.h vr15, a4, 0, 4 addi.d a1, a1, 2 addi.d a2, a2, 2 addi.d a3, a3, 2 addi.d a4, a4, 2 addi.d a0, a0, 4 addi.d t5, t5, 4 addi.d t7, t7, 4 addi.d t6, t6, -2 b .LOOPW2 .LOOPW1: bge zero, t6, .ENDLOOPW1 vld vr0, a0, 0 vld vr1, t5, 0 vld vr2, t7, 0 vld vr3, a0, 1 vld vr4, t5, 1 vld vr5, t7, 1 // Calculate dst0, dsth, dstv and dstc vavgr.bu vr12, vr0, vr1 vavgr.bu vr13, vr1, vr2 vavgr.bu vr14, vr3, vr4 vavgr.bu vr15, vr4, vr5 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vssrarni.bu.h vr13, vr12, 1 vssrarni.bu.h vr15, vr14, 1 vstelm.b vr13, a1, 0, 0 vstelm.b vr15, a2, 0, 0 vstelm.b vr13, a3, 0, 8 vstelm.b vr15, a4, 0, 8 .ENDLOOPW1: sub.d s1, a7, t6 sub.d a0, a0, s1 sub.d a0, a0, s1 add.d a0, a0, s0 sub.d a1, a1, s1 add.d a1, a1, a6 sub.d a2, a2, s1 add.d a2, a2, a6 sub.d a3, a3, s1 add.d a3, a3, a6 sub.d a4, a4, s1 add.d a4, a4, a6 b .LOOPH .ENDLOOPH: ld.d s0, sp, 0 ld.d s1, sp, 8 addi.d sp, sp, 16 endfunc_x264 #endif /* !HIGH_BIT_DEPTH */ x264-master/common/loongarch/mc-c.c000066400000000000000000000351011502133446700173060ustar00rootroot00000000000000/***************************************************************************** * mc-c.c: loongarch motion compensation ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later 
version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "mc.h" #if !HIGH_BIT_DEPTH #define MC_WEIGHT_LSX(func) \ static void (* mc##func##_wtab_lsx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) = \ { \ x264_mc_weight_w4##func##_lsx, \ x264_mc_weight_w4##func##_lsx, \ x264_mc_weight_w8##func##_lsx, \ x264_mc_weight_w16##func##_lsx, \ x264_mc_weight_w16##func##_lsx, \ x264_mc_weight_w20##func##_lsx, \ }; #define MC_WEIGHT(func) \ static void (* mc##func##_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) = \ { \ x264_mc_weight_w4##func##_lasx, \ x264_mc_weight_w4##func##_lasx, \ x264_mc_weight_w8##func##_lasx, \ x264_mc_weight_w16##func##_lasx, \ x264_mc_weight_w16##func##_lasx, \ x264_mc_weight_w20##func##_lasx, \ }; #if !HIGH_BIT_DEPTH MC_WEIGHT_LSX() MC_WEIGHT_LSX(_noden) MC_WEIGHT() MC_WEIGHT(_noden) #endif static void weight_cache_lsx( x264_t *h, x264_weight_t *w ) { if ( w->i_denom >= 1) { w->weightfn = mc_wtab_lsx; } else w->weightfn = mc_noden_wtab_lsx; } static weight_fn_t mc_weight_wtab_lsx[6] = { x264_mc_weight_w4_lsx, x264_mc_weight_w4_lsx, x264_mc_weight_w8_lsx, x264_mc_weight_w16_lsx, x264_mc_weight_w16_lsx, x264_mc_weight_w20_lsx, }; static void (* const pixel_avg_wtab_lsx[6])(uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = { NULL, x264_pixel_avg2_w4_lsx, x264_pixel_avg2_w8_lsx, x264_pixel_avg2_w16_lsx, x264_pixel_avg2_w16_lsx, x264_pixel_avg2_w20_lsx, }; static void (* const mc_copy_wtab_lsx[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = { NULL, x264_mc_copy_w4_lsx, x264_mc_copy_w8_lsx, NULL, x264_mc_copy_w16_lsx, }; static void weight_cache_lasx( x264_t *h, x264_weight_t *w ) { if ( w->i_denom >= 1) { w->weightfn = mc_wtab_lasx; } else w->weightfn = mc_noden_wtab_lasx; } static weight_fn_t mc_weight_wtab_lasx[6] = { x264_mc_weight_w4_lasx, x264_mc_weight_w4_lasx, x264_mc_weight_w8_lasx, x264_mc_weight_w16_lasx, x264_mc_weight_w16_lasx, x264_mc_weight_w20_lasx, }; static void (* const pixel_avg_wtab_lasx[6])(uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = { NULL, x264_pixel_avg2_w4_lasx, x264_pixel_avg2_w8_lasx, x264_pixel_avg2_w16_lasx, x264_pixel_avg2_w16_lasx, x264_pixel_avg2_w20_lasx, }; static void (* const mc_copy_wtab_lasx[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = { NULL, x264_mc_copy_w4_lasx, x264_mc_copy_w8_lasx, NULL, x264_mc_copy_w16_lasx, }; static uint8_t *get_ref_lsx( uint8_t *p_dst, intptr_t *p_dst_stride, uint8_t *p_src[4], intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height, const x264_weight_t *pWeight ) { int32_t i_qpel_idx; int32_t i_offset; uint8_t *p_src1; int32_t r_vy = m_vy & 3; int32_t r_vx = m_vx & 3; int32_t width = i_width >> 2; i_qpel_idx = ( r_vy << 2 ) + r_vx; i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + 
i_offset + ( 3 == r_vy ) * i_src_stride; if( i_qpel_idx & 5 ) { uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + i_offset + ( 3 == r_vx ); pixel_avg_wtab_lsx[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, p_src2, i_height ); if( pWeight->weightfn ) { pWeight->weightfn[width](p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height); } return p_dst; } else if ( pWeight->weightfn ) { pWeight->weightfn[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_height ); return p_dst; } else { *p_dst_stride = i_src_stride; return p_src1; } } static void mc_luma_lsx( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src[4], intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height, const x264_weight_t *pWeight ) { int32_t i_qpel_idx; int32_t i_offset; uint8_t *p_src1; i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + ( 3 == ( m_vy & 3 ) ) * i_src_stride; if( i_qpel_idx & 5 ) { uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + i_offset + ( 3 == ( m_vx & 3 ) ); pixel_avg_wtab_lsx[i_width >> 2]( p_dst, i_dst_stride, p_src1, i_src_stride, p_src2, i_height ); if( pWeight->weightfn ) { pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); } } else if( pWeight->weightfn ) { pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); } else { mc_copy_wtab_lsx[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, i_height ); } } PLANE_INTERLEAVE(lsx) PLANE_COPY_YUYV(32, lsx) #define x264_mc_chroma_lsx x264_template(mc_chroma_lsx) void x264_mc_chroma_lsx( uint8_t *p_dst_u, uint8_t *p_dst_v, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height ); static uint8_t *get_ref_lasx( uint8_t *p_dst, intptr_t *p_dst_stride, uint8_t *p_src[4], intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height, const x264_weight_t *pWeight ) { int32_t i_qpel_idx; int32_t i_offset; uint8_t *p_src1; int32_t r_vy = m_vy & 3; int32_t r_vx = m_vx & 3; int32_t width = i_width >> 2; i_qpel_idx = ( r_vy << 2 ) + r_vx; i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + ( 3 == r_vy ) * i_src_stride; if( i_qpel_idx & 5 ) { uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + i_offset + ( 3 == r_vx ); pixel_avg_wtab_lasx[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, p_src2, i_height ); if( pWeight->weightfn ) { pWeight->weightfn[width](p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height); } return p_dst; } else if ( pWeight->weightfn ) { pWeight->weightfn[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_height ); return p_dst; } else { *p_dst_stride = i_src_stride; return p_src1; } } static void mc_luma_lasx( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src[4], intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height, const x264_weight_t *pWeight ) { int32_t i_qpel_idx; int32_t i_offset; uint8_t *p_src1; i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + ( 3 == ( m_vy & 3 ) ) * i_src_stride; if( i_qpel_idx & 5 ) { uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + i_offset + ( 3 == ( m_vx & 3 ) ); pixel_avg_wtab_lasx[i_width >> 2]( p_dst, i_dst_stride, p_src1, 
i_src_stride, p_src2, i_height ); if( pWeight->weightfn ) { pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); } } else if( pWeight->weightfn ) { pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); } else { mc_copy_wtab_lasx[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, i_height ); } } PLANE_COPY_YUYV(64, lasx) #define x264_mc_chroma_lasx x264_template(mc_chroma_lasx) void x264_mc_chroma_lasx( uint8_t *p_dst_u, uint8_t *p_dst_v, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height ); #endif // !HIGH_BIT_DEPTH void x264_mc_init_loongarch( int32_t cpu, x264_mc_functions_t *pf ) { #if !HIGH_BIT_DEPTH if( cpu & X264_CPU_LSX ) { pf->mc_luma = mc_luma_lsx; pf->mc_chroma = x264_mc_chroma_lsx; pf->get_ref = get_ref_lsx; pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_lsx; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_lsx; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_lsx; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_lsx; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_lsx; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_lsx; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_lsx; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_lsx; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_lsx; pf->weight = mc_weight_wtab_lsx; pf->offsetadd = mc_weight_wtab_lsx; pf->offsetsub = mc_weight_wtab_lsx; pf->weight_cache = weight_cache_lsx; pf->copy_16x16_unaligned = x264_mc_copy_w16_lsx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_lsx; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_lsx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_lsx; pf->store_interleave_chroma = x264_store_interleave_chroma_lsx; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_lsx; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_lsx; pf->plane_copy_interleave = plane_copy_interleave_lsx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_lsx; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lsx; pf->hpel_filter = x264_hpel_filter_lsx; pf->memcpy_aligned = x264_memcpy_aligned_lsx; pf->memzero_aligned = x264_memzero_aligned_lsx; pf->frame_init_lowres_core = x264_frame_init_lowres_core_lsx; pf->prefetch_fenc_420 = x264_prefetch_fenc_420_lsx; pf->prefetch_fenc_422 = x264_prefetch_fenc_422_lsx; pf->prefetch_ref = x264_prefetch_ref_lsx; } if( cpu & X264_CPU_LASX ) { pf->mc_luma = mc_luma_lasx; pf->mc_chroma = x264_mc_chroma_lasx; pf->get_ref = get_ref_lasx; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_lasx; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_lasx; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_lasx; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_lasx; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_lasx; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_lasx; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_lasx; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_lasx; pf->weight = mc_weight_wtab_lasx; pf->offsetadd = mc_weight_wtab_lasx; pf->offsetsub = mc_weight_wtab_lasx; pf->weight_cache = weight_cache_lasx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_lasx; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lasx; pf->copy_16x16_unaligned = x264_mc_copy_w16_lasx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_lasx; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_lasx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_lasx; pf->hpel_filter = x264_hpel_filter_lasx; pf->memzero_aligned = x264_memzero_aligned_lasx; pf->frame_init_lowres_core = x264_frame_init_lowres_core_lasx; } #endif // !HIGH_BIT_DEPTH } 
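Editor's note: the mc_luma_lsx/mc_luma_lasx and get_ref_lsx/get_ref_lasx wrappers above all reduce a quarter-pel motion vector to the same few integers before indexing the pixel_avg/mc_copy/weight tables. The standalone sketch below is not part of the x264 sources; the helper name qpel_dispatch and the sample values are invented for illustration, and the actual half-pel plane selection additionally goes through the x264_hpel_ref0/x264_hpel_ref1 tables, which are not reproduced here. It only replays the index arithmetic visible in mc_luma_lsx() so the table dispatch is easier to follow.

#include <stdio.h>

/* Same index math as mc_luma_lsx() above: split a quarter-pel MV into the
 * full-pel offset, the 4-bit sub-pel index, and the width-table slot.
 * (The real code also biases the plane pointer by one extra stride when
 * (mvy & 3) == 3.) */
static void qpel_dispatch( int mvx, int mvy, long stride, int width )
{
    int  qpel_idx  = ( ( mvy & 3 ) << 2 ) + ( mvx & 3 );    /* 0..15 sub-pel position */
    long offset    = ( mvy >> 2 ) * stride + ( mvx >> 2 );  /* integer-pel part of the MV */
    int  needs_avg = ( qpel_idx & 5 ) != 0;  /* quarter-pel in x or y: average two hpel planes */
    int  wtab_idx  = width >> 2;             /* widths 4/8/16/20 -> table slots 1/2/4/5 */

    printf( "mv=(%d,%d)  qpel_idx=%2d  offset=%ld  avg=%d  wtab=%d\n",
            mvx, mvy, qpel_idx, offset, needs_avg, wtab_idx );
}

int main( void )
{
    qpel_dispatch( 5, -3, 64, 16 );  /* fractional in both -> averaging path    */
    qpel_dispatch( 8,  4, 64,  8 );  /* full-pel           -> plain copy path   */
    qpel_dispatch( 2,  0, 64, 20 );  /* half-pel in x only -> single hpel plane */
    return 0;
}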
x264-master/common/loongarch/mc.h000066400000000000000000000313501502133446700170750ustar00rootroot00000000000000/***************************************************************************** * mc.h: loongarch motion compensation ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_LOONGARCH_MC_H #define X264_LOONGARCH_MC_H #define x264_mc_init_loongarch x264_template(mc_init_loongarch) void x264_mc_init_loongarch( int cpu, x264_mc_functions_t *pf ); #define x264_pixel_avg_16x16_lsx x264_template(pixel_avg_16x16_lsx) void x264_pixel_avg_16x16_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_16x8_lsx x264_template(pixel_avg_16x8_lsx) void x264_pixel_avg_16x8_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x16_lsx x264_template(pixel_avg_8x16_lsx) void x264_pixel_avg_8x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x8_lsx x264_template(pixel_avg_8x8_lsx) void x264_pixel_avg_8x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x4_lsx x264_template(pixel_avg_8x4_lsx) void x264_pixel_avg_8x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x16_lsx x264_template(pixel_avg_4x16_lsx) void x264_pixel_avg_4x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x8_lsx x264_template(pixel_avg_4x8_lsx) void x264_pixel_avg_4x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x4_lsx x264_template(pixel_avg_4x4_lsx) void x264_pixel_avg_4x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x2_lsx x264_template(pixel_avg_4x2_lsx) void x264_pixel_avg_4x2_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg2_w4_lsx x264_template(pixel_avg2_w4_lsx) void x264_pixel_avg2_w4_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w8_lsx x264_template(pixel_avg2_w8_lsx) void x264_pixel_avg2_w8_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w16_lsx x264_template(pixel_avg2_w16_lsx) void x264_pixel_avg2_w16_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w20_lsx x264_template(pixel_avg2_w20_lsx) void x264_pixel_avg2_w20_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, 
int ); #define x264_mc_weight_w20_lsx x264_template(mc_weight_w20_lsx) void x264_mc_weight_w20_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w20_noden_lsx x264_template(mc_weight_w20_noden_lsx) void x264_mc_weight_w20_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w16_lsx x264_template(mc_weight_w16_lsx) void x264_mc_weight_w16_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w16_noden_lsx x264_template(mc_weight_w16_noden_lsx) void x264_mc_weight_w16_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w8_lsx x264_template(mc_weight_w8_lsx) void x264_mc_weight_w8_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w8_noden_lsx x264_template(mc_weight_w8_noden_lsx) void x264_mc_weight_w8_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w4_lsx x264_template(mc_weight_w4_lsx) void x264_mc_weight_w4_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w4_noden_lsx x264_template(mc_weight_w4_noden_lsx) void x264_mc_weight_w4_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_copy_w16_lsx x264_template(mc_copy_w16_lsx) void x264_mc_copy_w16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w8_lsx x264_template(mc_copy_w8_lsx) void x264_mc_copy_w8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w4_lsx x264_template(mc_copy_w4_lsx) void x264_mc_copy_w4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_store_interleave_chroma_lsx x264_template(store_interleave_chroma_lsx) void x264_store_interleave_chroma_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); #define x264_load_deinterleave_chroma_fenc_lsx x264_template(load_deinterleave_chroma_fenc_lsx) void x264_load_deinterleave_chroma_fenc_lsx( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_lsx x264_template(load_deinterleave_chroma_fdec_lsx) void x264_load_deinterleave_chroma_fdec_lsx( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_plane_copy_interleave_core_lsx x264_template(plane_copy_interleave_core_lsx) void x264_plane_copy_interleave_core_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_plane_copy_deinterleave_lsx x264_template(plane_copy_deinterleave_lsx) void x264_plane_copy_deinterleave_lsx( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx) void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); #define x264_prefetch_fenc_420_lsx x264_template(prefetch_fenc_420_lsx) void x264_prefetch_fenc_420_lsx( uint8_t *pix_y, intptr_t stride_y, uint8_t *pix_uv, intptr_t stride_uv, int32_t mb_x ); #define x264_prefetch_fenc_422_lsx x264_template(prefetch_fenc_422_lsx) void x264_prefetch_fenc_422_lsx( uint8_t *pix_y, intptr_t stride_y, uint8_t *pix_uv, intptr_t stride_uv, int32_t mb_x ); #define x264_prefetch_ref_lsx x264_template(prefetch_ref_lsx) void x264_prefetch_ref_lsx( uint8_t *pix, intptr_t stride, int32_t parity 
); #define x264_memcpy_aligned_lsx x264_template(memcpy_aligned_lsx) void *x264_memcpy_aligned_lsx( void *dst, const void *src, size_t n ); #define x264_memzero_aligned_lsx x264_template(memzero_aligned_lsx) void x264_memzero_aligned_lsx( void *p_dst, size_t n ); #define x264_hpel_filter_lsx x264_template(hpel_filter_lsx) void x264_hpel_filter_lsx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * ); #define x264_frame_init_lowres_core_lsx x264_template(frame_init_lowres_core_lsx) void x264_frame_init_lowres_core_lsx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); #define x264_pixel_avg_16x8_lasx x264_template(pixel_avg_16x8_lasx) void x264_pixel_avg_16x8_lasx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x16_lasx x264_template(pixel_avg_8x16_lasx) void x264_pixel_avg_8x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x8_lasx x264_template(pixel_avg_8x8_lasx) void x264_pixel_avg_8x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_8x4_lasx x264_template(pixel_avg_8x4_lasx) void x264_pixel_avg_8x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x16_lasx x264_template(pixel_avg_4x16_lasx) void x264_pixel_avg_4x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x8_lasx x264_template(pixel_avg_4x8_lasx) void x264_pixel_avg_4x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x4_lasx x264_template(pixel_avg_4x4_lasx) void x264_pixel_avg_4x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg_4x2_lasx x264_template(pixel_avg_4x2_lasx) void x264_pixel_avg_4x2_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_pixel_avg2_w4_lasx x264_template(pixel_avg2_w4_lasx) void x264_pixel_avg2_w4_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w8_lasx x264_template(pixel_avg2_w8_lasx) void x264_pixel_avg2_w8_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w16_lasx x264_template(pixel_avg2_w16_lasx) void x264_pixel_avg2_w16_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_pixel_avg2_w20_lasx x264_template(pixel_avg2_w20_lasx) void x264_pixel_avg2_w20_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define x264_mc_weight_w20_lasx x264_template(mc_weight_w20_lasx) void x264_mc_weight_w20_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w20_noden_lasx x264_template(mc_weight_w20_noden_lasx) void x264_mc_weight_w20_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w16_lasx x264_template(mc_weight_w16_lasx) void x264_mc_weight_w16_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w16_noden_lasx x264_template(mc_weight_w16_noden_lasx) void x264_mc_weight_w16_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w8_lasx x264_template(mc_weight_w8_lasx) void x264_mc_weight_w8_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w8_noden_lasx x264_template(mc_weight_w8_noden_lasx) void 
x264_mc_weight_w8_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w4_lasx x264_template(mc_weight_w4_lasx) void x264_mc_weight_w4_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_weight_w4_noden_lasx x264_template(mc_weight_w4_noden_lasx) void x264_mc_weight_w4_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_copy_w16_lasx x264_template(mc_copy_w16_lasx) void x264_mc_copy_w16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w8_lasx x264_template(mc_copy_w8_lasx) void x264_mc_copy_w8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_mc_copy_w4_lasx x264_template(mc_copy_w4_lasx) void x264_mc_copy_w4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); #define x264_plane_copy_interleave_core_lasx x264_template(plane_copy_interleave_core_lasx) void x264_plane_copy_interleave_core_lasx( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx) void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); #define x264_memzero_aligned_lasx x264_template(memzero_aligned_lasx) void x264_memzero_aligned_lasx( void *p_dst, size_t n ); #define x264_hpel_filter_lasx x264_template(hpel_filter_lasx) void x264_hpel_filter_lasx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * ); #define x264_frame_init_lowres_core_lasx x264_template(frame_init_lowres_core_lasx) void x264_frame_init_lowres_core_lasx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); #endif x264-master/common/loongarch/pixel-a.S000066400000000000000000003722701502133446700200210ustar00rootroot00000000000000/***************************************************************************** * pixel-a.S: LoongArch pixel metrics ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Hecai Yuan * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" #if !HIGH_BIT_DEPTH const hmul_8p .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1 endconst const mask_ac4b .short 0, -1, 0, -1, -1, -1, -1, -1 .short 0, -1, 0, -1, -1, -1, -1, -1 endconst const mask_ac8 .short 0, -1, -1, -1, -1, -1, -1, -1 .short 0, -1, -1, -1, -1, -1, -1, -1 endconst .macro LOAD_INC_8x4W n1, n2, n3, n4, n5 vld $vr\n1, a0, 0 vldx $vr\n2, a0, a1 vldx $vr\n3, a0, t0 vldx $vr\n4, a0, t1 xvpermi.d xr18, $xr\n1, 0x05 xvpermi.d xr19, $xr\n2, 0x05 xvpermi.d xr20, $xr\n3, 0x05 xvpermi.d xr21, $xr\n4, 0x05 add.d a0, a0, t2 xvdp2.h.bu.b $xr\n1, xr18, $xr\n5 xvdp2.h.bu.b $xr\n2, xr19, $xr\n5 xvdp2.h.bu.b $xr\n3, xr20, $xr\n5 xvdp2.h.bu.b $xr\n4, xr21, $xr\n5 .endm .macro SUMSUB_BADC a, b, c, d xvadd.h \a, \a, \b xvadd.h \c, \c, \d xvadd.h \b, \b, \b xvadd.h \d, \d, \d xvsub.h \b, \b, \a xvsub.h \d, \d, \c .endm .macro HADAMARD4_V a, b, c, d SUMSUB_BADC \a, \b, \c, \d SUMSUB_BADC \a, \c, \b, \d .endm .macro HADAMARD_1 a, b, tmp xmov \tmp, \a xvpackod.h \a, \b, \a xvpackev.h \b, \b, \tmp xvadd.h \tmp, \a, \b xvsub.h \b, \b, \a xmov \a, \tmp .endm .macro HADAMARD_2 a, b, c xvpickod.w \c, \b, \a xvpickev.w \a, \b, \a xvadda.h \a, \a, xr17 xvadda.h \c, \c, xr17 xvmax.h \a, \a, \c .endm .macro HADAMARD_AC_WXH_LASX w, h function_x264 pixel_hadamard_ac_\w\()x\h\()_lasx add.d t0, a1, a1 add.d t1, a1, t0 add.d t2, t1, a1 xvxor.v xr17, xr17, xr17 move t4, ra bl x264_8_hadamard_ac_16x8_lasx .if \h == 16 xmov xr11, xr9 xmov xr10, xr8 bl x264_8_hadamard_ac_16x8_lasx xvadd.h xr9, xr9, xr11 xvadd.h xr8, xr8, xr10 .endif move ra, t4 xvhaddw.wu.hu xr8, xr8, xr8 xvhaddw.du.wu xr8, xr8, xr8 xvhaddw.qu.du xr8, xr8, xr8 xvpickve2gr.wu t0, xr8, 0 xvpickve2gr.wu t1, xr8, 4 add.d t0, t0, t1 xvhaddw.wu.hu xr9, xr9, xr9 xvhaddw.du.wu xr9, xr9, xr9 xvhaddw.qu.du xr9, xr9, xr9 xvpickve2gr.wu t1, xr9, 0 xvpickve2gr.wu t2, xr9, 4 add.d t1, t1, t2 srli.d t0, t0, 2 srli.d t1, t1, 1 slli.d t0, t0, 32 add.d a0, t0, t1 endfunc_x264 .endm function_x264 hadamard_ac_16x8_lasx /* Load intermediate variable */ la.local t3, hmul_8p xvld xr8, t3, 0 LOAD_INC_8x4W 0, 1, 2, 3, 8 HADAMARD4_V xr0, xr1, xr2, xr3 LOAD_INC_8x4W 4, 5, 6, 7, 8 HADAMARD4_V xr4, xr5, xr6, xr7 HADAMARD_1 xr0, xr1, xr8 HADAMARD_1 xr2, xr3, xr8 xmov xr18, xr1 HADAMARD_1 xr4, xr5, xr8 HADAMARD_1 xr6, xr7, xr8 xmov xr19, xr2 xmov xr20, xr3 xvadda.h xr1, xr0, xr4 xvsub.h xr21, xr4, xr0 xvadd.h xr0, xr4, xr0 la.local t3, mask_ac4b xvld xr8, t3, 0 xvand.v xr1, xr1, xr8 xvadda.h xr1, xr1, xr5 xvadda.h xr1, xr1, xr18 xvadda.h xr1, xr1, xr19 xvadda.h xr1, xr1, xr20 xvadda.h xr1, xr1, xr6 xvadda.h xr9, xr1, xr7 xvadd.h xr3, xr7, xr20 xvsub.h xr7, xr7, xr20 xvadd.h xr2, xr6, xr19 xvsub.h xr6, xr6, xr19 xvadd.h xr1, xr5, xr18 xvsub.h xr5, xr5, xr18 HADAMARD_2 xr3, xr7, xr18 HADAMARD_2 xr2, xr6, xr19 HADAMARD_2 xr1, xr5, xr20 xvpickod.w xr5, xr21, xr0 xvpickev.w xr0, xr21, xr0 xmov xr4, xr5 xvadd.h xr5, xr0, xr4 xvsub.h xr4, xr4, xr0 xvadd.h xr2, xr2, xr3 xvadd.h xr2, xr2, xr1 xvadd.h xr2, xr2, xr2 la.local t3, mask_ac8 xvld xr8, t3, 0 xvand.v xr0, xr5, xr8 xvadda.h xr2, xr2, xr4 xvadda.h xr8, xr2, xr0 endfunc_x264 HADAMARD_AC_WXH_LASX 16, 8 HADAMARD_AC_WXH_LASX 16, 16 /* uint64_t hadamard_ac_8x8_lasx(uint8_t *p_pix, * int32_t i_stride) */ function_x264 hadamard_ac_8x8_lasx /* Load intermediate variable */ slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a1, 2 LSX_LOADX_4 a0, 
a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 vilvl.d vr8, vr1, vr0 vilvl.d vr9, vr3, vr2 vilvl.d vr10, vr5, vr4 vilvl.d vr11, vr7, vr6 xvpermi.q xr8, xr10, 0x02 xvpermi.q xr9, xr11, 0x02 xvpickev.b xr12, xr9, xr8 xvpickod.b xr13, xr9, xr8 xvaddwev.h.bu xr8, xr12, xr13 xvaddwod.h.bu xr9, xr12, xr13 xvsubwev.h.bu xr10, xr12, xr13 xvsubwod.h.bu xr11, xr12, xr13 xvadd.h xr12, xr8, xr9 xvadd.h xr13, xr10, xr11 xvsub.h xr14, xr8, xr9 xvsub.h xr15, xr10, xr11 xvilvl.h xr8, xr13, xr12 xvilvh.h xr9, xr13, xr12 xvilvl.h xr10, xr15, xr14 xvilvh.h xr11, xr15, xr14 xvilvl.w xr12, xr10, xr8 xvilvh.w xr13, xr10, xr8 xvilvl.w xr14, xr11, xr9 xvilvh.w xr15, xr11, xr9 xvadd.h xr8, xr12, xr13 xvadd.h xr9, xr14, xr15 xvsub.h xr10, xr12, xr13 xvsub.h xr11, xr14, xr15 xvadd.h xr12, xr8, xr9 xvadd.h xr13, xr10, xr11 xvsub.h xr14, xr8, xr9 xvsub.h xr15, xr10, xr11 vpickve2gr.hu t3, vr12, 0 vpickve2gr.hu t4, vr12, 4 xvor.v xr16, xr12, xr12 xvpermi.q xr16, xr16, 0x31 vpickve2gr.hu t5, vr16, 0 vpickve2gr.hu t6, vr16, 4 add.d t3, t3, t4 add.d t5, t5, t6 add.d t3, t3, t5 xvadda.h xr16, xr12, xr13 xvadda.h xr18, xr14, xr15 xvadd.h xr16, xr16, xr18 xvpermi.d xr17, xr16, 0x4e xvadd.h xr18, xr16, xr17 xvhaddw.wu.hu xr18, xr18, xr18 xvhaddw.du.wu xr18, xr18, xr18 xvhaddw.qu.du xr18, xr18, xr18 xvpickve2gr.wu t4, xr18, 0 xvpackev.h xr8, xr13, xr12 xvpackev.h xr9, xr15, xr14 xvpackod.h xr10, xr13, xr12 xvpackod.h xr11, xr15, xr14 xvilvl.d xr12, xr9, xr8 xvilvh.d xr13, xr9, xr8 xvilvl.d xr14, xr11, xr10 xvilvh.d xr15, xr11, xr10 xvor.v xr16, xr12, xr12 xvor.v xr17, xr13, xr13 xvpermi.q xr12, xr14, 0x02 xvpermi.q xr13, xr14, 0x12 xvpermi.q xr16, xr15, 0x03 xvpermi.q xr17, xr15, 0x13 xvadd.h xr8, xr12, xr13 xvsub.h xr9, xr12, xr13 xvadd.h xr10, xr16, xr17 xvsub.h xr11, xr16, xr17 xvadd.h xr12, xr8, xr10 xvadd.h xr13, xr9, xr11 xvsub.h xr14, xr8, xr10 xvsub.h xr15, xr9, xr11 xvadda.h xr16, xr12, xr13 xvadda.h xr17, xr14, xr15 xvadd.h xr18, xr16, xr17 xvpermi.d xr19, xr18, 0x4e xvadd.d xr19, xr18, xr19 xvhaddw.wu.hu xr19, xr19, xr19 xvhaddw.du.wu xr19, xr19, xr19 xvhaddw.qu.du xr19, xr19, xr19 xvpickve2gr.wu t5, xr19, 0 sub.d t4, t4, t3 sub.d t5, t5, t3 slli.d t5, t5, 32 add.d a0, t5, t4 endfunc_x264 /* int x264_pixel_satd_16x16_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_16x16_lasx slli.d t2, a1, 1 slli.d t3, a3, 1 slli.d t4, a1, 2 slli.d t5, a3, 2 add.d t6, a1, t2 add.d t7, a3, t3 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3 add.d a0, a0, t4 LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15 xvpermi.q xr0, xr4, 0x02 xvpermi.q xr1, xr5, 0x02 xvpermi.q xr2, xr6, 0x02 xvpermi.q xr3, xr7, 0x02 xvpermi.q xr8, xr12, 0x02 xvpermi.q xr9, xr13, 0x02 xvpermi.q xr10, xr14, 0x02 xvpermi.q xr11, xr15, 0x02 // HADAMARD4 xvsubwev.h.bu xr4, xr0, xr8 xvsubwod.h.bu xr5, xr0, xr8 xvsubwev.h.bu xr6, xr1, xr9 xvsubwod.h.bu xr7, xr1, xr9 xvsubwev.h.bu xr8, xr2, xr10 xvsubwod.h.bu xr9, xr2, xr10 xvsubwev.h.bu xr12, xr3, xr11 xvsubwod.h.bu xr13, xr3, xr11 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr8, xr9 xvsub.h xr5, xr8, xr9 xvadd.h xr6, xr12, xr13 xvsub.h xr7, xr12, xr13 xvpackev.h xr8, xr5, xr4 xvpackod.h xr9, xr5, xr4 xvpackev.h xr10, xr7, xr6 xvpackod.h xr11, xr7, xr6 xvpackev.h xr4, xr1, xr0 xvpackod.h xr5, xr1, xr0 xvpackev.h xr6, xr3, xr2 
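/* satd_16x16: the add/sub butterflies and even/odd pack steps around this point form a 4x4 Hadamard transform of the pix1-pix2 differences; the absolute sums of the first 16x8 half are collected in xr16 and the grand total is halved before returning. */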
xvpackod.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr8, xr9 xvsub.h xr5, xr8, xr9 xvadd.h xr6, xr10, xr11 xvsub.h xr7, xr10, xr11 xvilvl.h xr8, xr1, xr0 xvilvl.h xr9, xr3, xr2 xvilvl.h xr10, xr5, xr4 xvilvl.h xr11, xr7, xr6 xvilvh.h xr0, xr1, xr0 xvilvh.h xr1, xr3, xr2 xvilvh.h xr2, xr5, xr4 xvilvh.h xr3, xr7, xr6 xvadd.h xr4, xr8, xr9 xvadd.h xr6, xr10, xr11 xvsub.h xr5, xr8, xr9 xvsub.h xr7, xr10, xr11 xvadd.h xr8, xr4, xr6 xvadd.h xr9, xr5, xr7 xvsub.h xr10, xr4, xr6 xvsub.h xr11, xr5, xr7 xvadd.h xr4, xr0, xr1 xvadd.h xr6, xr2, xr3 xvsub.h xr5, xr0, xr1 xvsub.h xr7, xr2, xr3 xvadd.h xr0, xr4, xr6 xvadd.h xr1, xr5, xr7 xvsub.h xr2, xr4, xr6 xvsub.h xr3, xr5, xr7 xvadda.h xr8, xr8, xr9 xvadda.h xr9, xr10, xr11 xvadda.h xr0, xr0, xr1 xvadda.h xr1, xr2, xr3 xvadd.h xr8, xr8, xr9 xvadd.h xr0, xr0, xr1 xvadd.h xr16, xr0, xr8 add.d a0, a0, t4 add.d a2, a2, t5 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3 add.d a0, a0, t4 LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15 xvpermi.q xr0, xr4, 0x02 xvpermi.q xr1, xr5, 0x02 xvpermi.q xr2, xr6, 0x02 xvpermi.q xr3, xr7, 0x02 xvpermi.q xr8, xr12, 0x02 xvpermi.q xr9, xr13, 0x02 xvpermi.q xr10, xr14, 0x02 xvpermi.q xr11, xr15, 0x02 // HADAMARD4 xvsubwev.h.bu xr4, xr0, xr8 xvsubwod.h.bu xr5, xr0, xr8 xvsubwev.h.bu xr6, xr1, xr9 xvsubwod.h.bu xr7, xr1, xr9 xvsubwev.h.bu xr8, xr2, xr10 xvsubwod.h.bu xr9, xr2, xr10 xvsubwev.h.bu xr12, xr3, xr11 xvsubwod.h.bu xr13, xr3, xr11 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr8, xr9 xvsub.h xr5, xr8, xr9 xvadd.h xr6, xr12, xr13 xvsub.h xr7, xr12, xr13 xvpackev.h xr8, xr5, xr4 xvpackod.h xr9, xr5, xr4 xvpackev.h xr10, xr7, xr6 xvpackod.h xr11, xr7, xr6 xvpackev.h xr4, xr1, xr0 xvpackod.h xr5, xr1, xr0 xvpackev.h xr6, xr3, xr2 xvpackod.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr8, xr9 xvsub.h xr5, xr8, xr9 xvadd.h xr6, xr10, xr11 xvsub.h xr7, xr10, xr11 xvilvl.h xr8, xr1, xr0 xvilvl.h xr9, xr3, xr2 xvilvl.h xr10, xr5, xr4 xvilvl.h xr11, xr7, xr6 xvilvh.h xr0, xr1, xr0 xvilvh.h xr1, xr3, xr2 xvilvh.h xr2, xr5, xr4 xvilvh.h xr3, xr7, xr6 xvadd.h xr4, xr8, xr9 xvadd.h xr6, xr10, xr11 xvsub.h xr5, xr8, xr9 xvsub.h xr7, xr10, xr11 xvadd.h xr8, xr4, xr6 xvadd.h xr9, xr5, xr7 xvsub.h xr10, xr4, xr6 xvsub.h xr11, xr5, xr7 xvadd.h xr4, xr0, xr1 xvadd.h xr6, xr2, xr3 xvsub.h xr5, xr0, xr1 xvsub.h xr7, xr2, xr3 xvadd.h xr0, xr4, xr6 xvadd.h xr1, xr5, xr7 xvsub.h xr2, xr4, xr6 xvsub.h xr3, xr5, xr7 xvadda.h xr8, xr8, xr9 xvadda.h xr9, xr10, xr11 xvadda.h xr0, xr0, xr1 xvadda.h xr1, xr2, xr3 xvadd.h xr8, xr8, xr9 xvadd.h xr0, xr0, xr1 xvadd.h xr0, xr0, xr8 xvadd.h xr0, xr0, xr16 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.du.wu xr0, xr0, xr0 xvhaddw.qu.du xr0, xr0, xr0 xvpickve2gr.wu t0, xr0, 0 xvpickve2gr.wu t1, xr0, 4 add.w t0, t0, t1 srli.d a0, t0, 1 endfunc_x264 /* int x264_pixel_satd_16x8_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_16x8_lasx slli.d t2, a1, 1 slli.d t3, a3, 1 slli.d t4, t2, 1 slli.d t5, t3, 1 add.d t6, a1, t2 add.d t7, a3, t3 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3 add.d a0, a0, t4 LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11 
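/* satd_16x8: pix1 rows 0-7 are in vr0-vr7 and pix2 rows 0-3 in vr8-vr11; the next four loads fetch pix2 rows 4-7 before both blocks are packed into 256-bit lanes for the Hadamard stage. */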
add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15 xvpermi.q xr0, xr4, 0x02 xvpermi.q xr1, xr5, 0x02 xvpermi.q xr2, xr6, 0x02 xvpermi.q xr3, xr7, 0x02 xvpermi.q xr8, xr12, 0x02 xvpermi.q xr9, xr13, 0x02 xvpermi.q xr10, xr14, 0x02 xvpermi.q xr11, xr15, 0x02 // HADAMARD4 xvsubwev.h.bu xr4, xr0, xr8 xvsubwod.h.bu xr5, xr0, xr8 xvsubwev.h.bu xr6, xr1, xr9 xvsubwod.h.bu xr7, xr1, xr9 xvsubwev.h.bu xr8, xr2, xr10 xvsubwod.h.bu xr9, xr2, xr10 xvsubwev.h.bu xr12, xr3, xr11 xvsubwod.h.bu xr13, xr3, xr11 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr8, xr9 xvsub.h xr5, xr8, xr9 xvadd.h xr6, xr12, xr13 xvsub.h xr7, xr12, xr13 xvpackev.h xr8, xr5, xr4 xvpackod.h xr9, xr5, xr4 xvpackev.h xr10, xr7, xr6 xvpackod.h xr11, xr7, xr6 xvpackev.h xr4, xr1, xr0 xvpackod.h xr5, xr1, xr0 xvpackev.h xr6, xr3, xr2 xvpackod.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr8, xr9 xvsub.h xr5, xr8, xr9 xvadd.h xr6, xr10, xr11 xvsub.h xr7, xr10, xr11 xvilvl.h xr8, xr1, xr0 xvilvl.h xr9, xr3, xr2 xvilvl.h xr10, xr5, xr4 xvilvl.h xr11, xr7, xr6 xvilvh.h xr0, xr1, xr0 xvilvh.h xr1, xr3, xr2 xvilvh.h xr2, xr5, xr4 xvilvh.h xr3, xr7, xr6 xvadd.h xr4, xr8, xr9 xvadd.h xr6, xr10, xr11 xvsub.h xr5, xr8, xr9 xvsub.h xr7, xr10, xr11 xvadd.h xr8, xr4, xr6 xvadd.h xr9, xr5, xr7 xvsub.h xr10, xr4, xr6 xvsub.h xr11, xr5, xr7 xvadd.h xr4, xr0, xr1 xvadd.h xr6, xr2, xr3 xvsub.h xr5, xr0, xr1 xvsub.h xr7, xr2, xr3 xvadd.h xr0, xr4, xr6 xvadd.h xr1, xr5, xr7 xvsub.h xr2, xr4, xr6 xvsub.h xr3, xr5, xr7 xvadda.h xr8, xr8, xr9 xvadda.h xr9, xr10, xr11 xvadda.h xr0, xr0, xr1 xvadda.h xr1, xr2, xr3 xvadd.h xr8, xr8, xr9 xvadd.h xr0, xr0, xr1 xvadd.h xr0, xr0, xr8 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.du.wu xr0, xr0, xr0 xvhaddw.qu.du xr0, xr0, xr0 xvpickve2gr.wu t0, xr0, 0 xvpickve2gr.wu t1, xr0, 4 add.w t0, t0, t1 srli.d a0, t0, 1 endfunc_x264 /* int x264_pixel_satd_8x16_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_8x16_lasx slli.d t2, a1, 1 add.d t3, a1, t2 slli.d t4, a1, 2 slli.d t5, a3, 1 add.d t6, a3, t5 slli.d t7, a3, 2 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3 add.d a0, a0, t4 LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11 add.d a2, a2, t7 LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr2, vr5, vr4 vilvl.d vr3, vr7, vr6 xvpermi.q xr0, xr2, 0x02 xvpermi.q xr1, xr3, 0x02 vilvl.d vr2, vr9, vr8 vilvl.d vr3, vr11, vr10 vilvl.d vr4, vr13, vr12 vilvl.d vr5, vr15, vr14 xvpermi.q xr2, xr4, 0x02 xvpermi.q xr3, xr5, 0x02 // HADAMARD4 xvsubwev.h.bu xr4, xr0, xr2 xvsubwod.h.bu xr5, xr0, xr2 xvsubwev.h.bu xr6, xr1, xr3 xvsubwod.h.bu xr7, xr1, xr3 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvpackev.h xr4, xr1, xr0 xvpackod.h xr5, xr1, xr0 xvpackev.h xr6, xr3, xr2 xvpackod.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvilvl.h xr4, xr1, xr0 xvilvh.h xr5, xr1, xr0 xvilvl.h xr6, xr3, xr2 xvilvh.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr1, xr4, xr5 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr0, xr2 xvadd.h xr5, xr1, xr3 xvsub.h xr6, xr0, xr2 xvsub.h xr7, xr1, xr3 xvadda.h xr0, xr4, xr5 xvadda.h xr1, xr6, xr7 xvadd.h xr16, xr0, xr1 add.d a0, a0, t4 add.d a2, a2, t7 // Load data from pix1 and pix2 
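/* satd_8x16, second 8x8 half: the first half's absolute Hadamard sums were just folded into xr16; the same transform is applied to the next eight rows, then both halves are summed and halved for the final SATD value. */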
LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3 add.d a0, a0, t4 LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11 add.d a2, a2, t7 LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr2, vr5, vr4 vilvl.d vr3, vr7, vr6 xvpermi.q xr0, xr2, 0x02 xvpermi.q xr1, xr3, 0x02 vilvl.d vr2, vr9, vr8 vilvl.d vr3, vr11, vr10 vilvl.d vr4, vr13, vr12 vilvl.d vr5, vr15, vr14 xvpermi.q xr2, xr4, 0x02 xvpermi.q xr3, xr5, 0x02 // HADAMARD4 xvsubwev.h.bu xr4, xr0, xr2 xvsubwod.h.bu xr5, xr0, xr2 xvsubwev.h.bu xr6, xr1, xr3 xvsubwod.h.bu xr7, xr1, xr3 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvpackev.h xr4, xr1, xr0 xvpackod.h xr5, xr1, xr0 xvpackev.h xr6, xr3, xr2 xvpackod.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvilvl.h xr4, xr1, xr0 xvilvh.h xr5, xr1, xr0 xvilvl.h xr6, xr3, xr2 xvilvh.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr1, xr4, xr5 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr0, xr2 xvadd.h xr5, xr1, xr3 xvsub.h xr6, xr0, xr2 xvsub.h xr7, xr1, xr3 xvadda.h xr0, xr4, xr5 xvadda.h xr1, xr6, xr7 xvadd.h xr0, xr0, xr1 xvadd.h xr0, xr0, xr16 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.du.wu xr0, xr0, xr0 xvhaddw.qu.du xr0, xr0, xr0 xvpickve2gr.wu t0, xr0, 0 xvpickve2gr.wu t1, xr0, 4 add.w t0, t0, t1 srli.d a0, t0, 1 endfunc_x264 /* int x264_pixel_satd_8x8_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_8x8_lasx slli.d t2, a1, 1 slli.d t5, a3, 1 add.d t3, a1, t2 add.d t6, a3, t5 slli.d t4, t2, 1 slli.d t7, t5, 1 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3 add.d a0, a0, t4 LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11 add.d a2, a2, t7 LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr2, vr5, vr4 vilvl.d vr3, vr7, vr6 xvpermi.q xr0, xr2, 0x02 xvpermi.q xr1, xr3, 0x02 vilvl.d vr2, vr9, vr8 vilvl.d vr3, vr11, vr10 vilvl.d vr4, vr13, vr12 vilvl.d vr5, vr15, vr14 xvpermi.q xr2, xr4, 0x02 xvpermi.q xr3, xr5, 0x02 // HADAMARD4 xvsubwev.h.bu xr4, xr0, xr2 xvsubwod.h.bu xr5, xr0, xr2 xvsubwev.h.bu xr6, xr1, xr3 xvsubwod.h.bu xr7, xr1, xr3 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvpackev.h xr4, xr1, xr0 xvpackod.h xr5, xr1, xr0 xvpackev.h xr6, xr3, xr2 xvpackod.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvsub.h xr1, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr3, xr6, xr7 xvilvl.h xr4, xr1, xr0 xvilvh.h xr5, xr1, xr0 xvilvl.h xr6, xr3, xr2 xvilvh.h xr7, xr3, xr2 xvadd.h xr0, xr4, xr5 xvadd.h xr2, xr6, xr7 xvsub.h xr1, xr4, xr5 xvsub.h xr3, xr6, xr7 xvadd.h xr4, xr0, xr2 xvadd.h xr5, xr1, xr3 xvsub.h xr6, xr0, xr2 xvsub.h xr7, xr1, xr3 xvadda.h xr0, xr4, xr5 xvadda.h xr1, xr6, xr7 xvadd.h xr0, xr0, xr1 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.du.wu xr0, xr0, xr0 xvhaddw.qu.du xr0, xr0, xr0 xvpickve2gr.wu t0, xr0, 0 xvpickve2gr.wu t1, xr0, 4 add.w t0, t0, t1 srli.d a0, t0, 1 endfunc_x264 /* int x264_pixel_satd_8x4_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_8x4_lasx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, 
vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr9, xr11, xr13 xvsub.h xr10, xr11, xr13 xvpackev.d xr11, xr10, xr9 xvpackod.d xr12, xr10, xr9 xvadda.h xr11, xr11, xr12 xvhaddw.wu.hu xr11, xr11, xr11 xvhaddw.du.wu xr11, xr11, xr11 xvhaddw.qu.du xr11, xr11, xr11 xvpickve2gr.wu t4, xr11, 0 xvpickve2gr.wu t5, xr11, 4 add.d t4, t4, t5 srli.d a0, t4, 1 endfunc_x264 /* int x264_pixel_satd_4x16_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_4x16_lasx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr9, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr10, vr7, vr5 slli.d t0, a1, 2 slli.d t1, a3, 2 // Load data from pix1 and pix2 add.d a0, a0, t0 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 add.d a2, a2, t1 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr1, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr5, vr7, vr5 xvpermi.q xr1, xr9, 0x20 xvpermi.q xr5, xr10, 0x20 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* b0 + b1 */ xvsub.h xr12, xr9, xr10 /* b0 - b1 */ xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadda.h xr9, xr9, xr10 xvhaddw.wu.hu xr9, xr9, xr9 xvhaddw.du.wu xr9, xr9, xr9 xvhaddw.qu.du xr9, xr9, xr9 xvpickve2gr.wu t6, xr9, 0 xvpickve2gr.wu t7, xr9, 4 add.d t7, t6, t7 // Load data from pix1 and pix2 add.d a0, a0, t0 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 add.d a2, a2, t1 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr9, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr10, vr7, vr5 // Load data from pix1 and pix2 add.d a0, a0, t0 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 add.d a2, a2, t1 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr1, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr5, vr7, vr5 xvpermi.q xr1, xr9, 0x20 xvpermi.q xr5, xr10, 0x20 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* b0 + b1 */ xvsub.h xr12, xr9, xr10 /* b0 - b1 */ xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadda.h xr9, xr9, 
xr10 xvhaddw.wu.hu xr9, xr9, xr9 xvhaddw.du.wu xr9, xr9, xr9 xvhaddw.qu.du xr9, xr9, xr9 xvpickve2gr.wu t6, xr9, 0 xvpickve2gr.wu t5, xr9, 4 add.d t6, t5, t6 add.d t7, t6, t7 srli.d a0, t7, 1 endfunc_x264 /* int x264_pixel_satd_4x8_lasx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_satd_4x8_lasx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr9, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr10, vr7, vr5 slli.d t0, a1, 2 slli.d t1, a3, 2 add.d a0, a0, t0 add.d a2, a2, t1 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr1, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr5, vr7, vr5 xvpermi.q xr1, xr9, 0x20 xvpermi.q xr5, xr10, 0x20 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* b0 + b1 */ xvsub.h xr12, xr9, xr10 /* b0 - b1 */ xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadda.h xr9, xr9, xr10 xvhaddw.wu.hu xr9, xr9, xr9 xvhaddw.du.wu xr9, xr9, xr9 xvhaddw.qu.du xr9, xr9, xr9 xvpickve2gr.wu t6, xr9, 0 xvpickve2gr.wu t7, xr9, 4 add.d t6, t6, t7 srli.d a0, t6, 1 endfunc_x264 /* int x264_pixel_satd_4x4_lsx(pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2) */ .macro pixel_satd_4x4_lsx_core out vilvl.w vr1, vr2, vr1 vilvl.w vr3, vr4, vr3 vilvl.d vr1, vr3, vr1 vilvl.w vr5, vr6, vr5 vilvl.w vr7, vr8, vr7 vilvl.d vr5, vr7, vr5 vsubwev.h.bu vr9, vr1, vr5 vsubwod.h.bu vr10, vr1, vr5 vadd.h vr11, vr9, vr10 /* a0 + a1 */ vsub.h vr12, vr9, vr10 /* a0 - a1 */ vpackev.h vr9, vr12, vr11 vpackod.h vr10, vr12, vr11 vadd.h vr11, vr9, vr10 /* b0 + b1 */ vsub.h vr12, vr9, vr10 /* b0 - b1 */ vpackev.w vr9, vr12, vr11 vpackod.w vr10, vr12, vr11 vadd.h vr11, vr9, vr10 /* HADAMARD4 */ vsub.h vr12, vr9, vr10 vpackev.d vr9, vr12, vr11 vpackod.d vr10, vr12, vr11 vadd.h vr11, vr9, vr10 vsub.h vr12, vr9, vr10 vpackev.d vr9, vr12, vr11 vpackod.d vr10, vr12, vr11 vadda.h \out, vr9, vr10 .endm function_x264 pixel_satd_4x4_lsx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 // Load data from pix1 and pix2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr13 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu t5, vr13, 0 srli.d a0, t5, 1 endfunc_x264 /* * int pixel_ssd_16x16_lasx(const Pixel *pix1, intptr_t stride_pix1, * const Pixel *pix2, intptr_t stride_pix2) */ function_x264 pixel_ssd_16x16_lasx slli.d t0, a1, 1 add.d t1, a1, t0 add.d t2, a1, t1 slli.d t3, a3, 1 add.d t4, a3, t3 add.d t5, a3, t4 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu 
xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr4, xr4 vext2xv.hu.bu xr5, xr5 vext2xv.hu.bu xr6, xr6 vext2xv.hu.bu xr7, xr7 vext2xv.hu.bu xr8, xr8 vext2xv.hu.bu xr9, xr9 vext2xv.hu.bu xr10, xr10 vext2xv.hu.bu xr11, xr11 vext2xv.hu.bu xr12, xr12 vext2xv.hu.bu xr13, xr13 vext2xv.hu.bu xr14, xr14 vext2xv.hu.bu xr15, xr15 // Calculate the square of the difference xvsub.h xr0, xr0, xr8 xvsub.h xr1, xr1, xr9 xvsub.h xr2, xr2, xr10 xvsub.h xr3, xr3, xr11 xvsub.h xr4, xr4, xr12 xvsub.h xr5, xr5, xr13 xvsub.h xr6, xr6, xr14 xvsub.h xr7, xr7, xr15 xvmul.h xr0, xr0, xr0 xvmul.h xr1, xr1, xr1 xvmul.h xr2, xr2, xr2 xvmul.h xr3, xr3, xr3 xvmul.h xr4, xr4, xr4 xvmul.h xr5, xr5, xr5 xvmul.h xr6, xr6, xr6 xvmul.h xr7, xr7, xr7 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.wu.hu xr1, xr1, xr1 xvhaddw.wu.hu xr2, xr2, xr2 xvhaddw.wu.hu xr3, xr3, xr3 xvhaddw.wu.hu xr4, xr4, xr4 xvhaddw.wu.hu xr5, xr5, xr5 xvhaddw.wu.hu xr6, xr6, xr6 xvhaddw.wu.hu xr7, xr7, xr7 xvadd.w xr16, xr0, xr1 xvadd.w xr17, xr2, xr3 xvadd.w xr18, xr4, xr5 xvadd.w xr19, xr6, xr7 xvadd.w xr16, xr16, xr17 xvadd.w xr18, xr18, xr19 xvadd.w xr16, xr16, xr18 // Load data from pix1 and pix2 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr4, xr4 vext2xv.hu.bu xr5, xr5 vext2xv.hu.bu xr6, xr6 vext2xv.hu.bu xr7, xr7 vext2xv.hu.bu xr8, xr8 vext2xv.hu.bu xr9, xr9 vext2xv.hu.bu xr10, xr10 vext2xv.hu.bu xr11, xr11 vext2xv.hu.bu xr12, xr12 vext2xv.hu.bu xr13, xr13 vext2xv.hu.bu xr14, xr14 vext2xv.hu.bu xr15, xr15 // Calculate the square of the difference xvsub.h xr0, xr0, xr8 xvsub.h xr1, xr1, xr9 xvsub.h xr2, xr2, xr10 xvsub.h xr3, xr3, xr11 xvsub.h xr4, xr4, xr12 xvsub.h xr5, xr5, xr13 xvsub.h xr6, xr6, xr14 xvsub.h xr7, xr7, xr15 xvmul.h xr0, xr0, xr0 xvmul.h xr1, xr1, xr1 xvmul.h xr2, xr2, xr2 xvmul.h xr3, xr3, xr3 xvmul.h xr4, xr4, xr4 xvmul.h xr5, xr5, xr5 xvmul.h xr6, xr6, xr6 xvmul.h xr7, xr7, xr7 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.wu.hu xr1, xr1, xr1 xvhaddw.wu.hu xr2, xr2, xr2 xvhaddw.wu.hu xr3, xr3, xr3 xvhaddw.wu.hu xr4, xr4, xr4 xvhaddw.wu.hu xr5, xr5, xr5 xvhaddw.wu.hu xr6, xr6, xr6 xvhaddw.wu.hu xr7, xr7, xr7 xvadd.w xr0, xr0, xr1 xvadd.w xr2, xr2, xr3 xvadd.w xr4, xr4, xr5 xvadd.w xr6, xr6, xr7 xvadd.w xr0, xr0, xr2 xvadd.w xr4, xr4, xr6 xvadd.w xr0, xr0, xr4 xvadd.w xr0, xr0, xr16 // Calculate the sum xvhaddw.d.w xr0, xr0, xr0 xvhaddw.q.d xr0, xr0, xr0 xvpickve2gr.w t2, xr0, 0 xvpickve2gr.w t3, xr0, 4 add.d a0, t2, t3 endfunc_x264 /* * int pixel_ssd_16x8_lasx(const Pixel *pix1, intptr_t stride_pix1, * const Pixel *pix2, intptr_t stride_pix2) */ function_x264 pixel_ssd_16x8_lasx slli.d t0, a1, 1 add.d t1, a1, t0 add.d t2, a1, t1 slli.d t3, a3, 1 add.d t4, a3, t3 add.d t5, a3, t4 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr4, xr4 vext2xv.hu.bu xr5, xr5 vext2xv.hu.bu xr6, xr6 vext2xv.hu.bu xr7, xr7 vext2xv.hu.bu xr8, xr8 vext2xv.hu.bu xr9, xr9 vext2xv.hu.bu xr10, xr10 vext2xv.hu.bu xr11, xr11 
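// The packed bytes are widened to 16-bit lanes before the subtraction so that
// each squared difference (at most 255*255) still fits in an unsigned 16-bit
// lane; the horizontal adds that follow widen the lanes to 32 bits.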
vext2xv.hu.bu xr12, xr12 vext2xv.hu.bu xr13, xr13 vext2xv.hu.bu xr14, xr14 vext2xv.hu.bu xr15, xr15 // Calculate the square of the difference xvsub.h xr0, xr0, xr8 xvsub.h xr1, xr1, xr9 xvsub.h xr2, xr2, xr10 xvsub.h xr3, xr3, xr11 xvsub.h xr4, xr4, xr12 xvsub.h xr5, xr5, xr13 xvsub.h xr6, xr6, xr14 xvsub.h xr7, xr7, xr15 xvmul.h xr0, xr0, xr0 xvmul.h xr1, xr1, xr1 xvmul.h xr2, xr2, xr2 xvmul.h xr3, xr3, xr3 xvmul.h xr4, xr4, xr4 xvmul.h xr5, xr5, xr5 xvmul.h xr6, xr6, xr6 xvmul.h xr7, xr7, xr7 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.wu.hu xr1, xr1, xr1 xvhaddw.wu.hu xr2, xr2, xr2 xvhaddw.wu.hu xr3, xr3, xr3 xvhaddw.wu.hu xr4, xr4, xr4 xvhaddw.wu.hu xr5, xr5, xr5 xvhaddw.wu.hu xr6, xr6, xr6 xvhaddw.wu.hu xr7, xr7, xr7 xvadd.w xr0, xr0, xr1 xvadd.w xr2, xr2, xr3 xvadd.w xr4, xr4, xr5 xvadd.w xr6, xr6, xr7 xvadd.w xr0, xr0, xr2 xvadd.w xr4, xr4, xr6 xvadd.w xr0, xr0, xr4 // Calculate the sum xvhaddw.d.w xr0, xr0, xr0 xvhaddw.q.d xr0, xr0, xr0 xvpickve2gr.w t2, xr0, 0 xvpickve2gr.w t3, xr0, 4 add.d a0, t2, t3 endfunc_x264 /* * int pixel_ssd_8x16_lasx(const Pixel *pix1, intptr_t stride_pix1, * const Pixel *pix2, intptr_t stride_pix2) */ function_x264 pixel_ssd_8x16_lasx slli.d t0, a1, 1 add.d t1, a1, t0 add.d t2, a1, t1 slli.d t3, a3, 1 add.d t4, a3, t3 add.d t5, a3, t4 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 vilvl.d vr0, vr4, vr0 vilvl.d vr1, vr5, vr1 vilvl.d vr2, vr6, vr2 vilvl.d vr3, vr7, vr3 vilvl.d vr8, vr12, vr8 vilvl.d vr9, vr13, vr9 vilvl.d vr10, vr14, vr10 vilvl.d vr11, vr15, vr11 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr8, xr8 vext2xv.hu.bu xr9, xr9 vext2xv.hu.bu xr10, xr10 vext2xv.hu.bu xr11, xr11 // Calculate the square of the difference xvsub.h xr0, xr0, xr8 xvsub.h xr1, xr1, xr9 xvsub.h xr2, xr2, xr10 xvsub.h xr3, xr3, xr11 xvmul.h xr0, xr0, xr0 xvmul.h xr1, xr1, xr1 xvmul.h xr2, xr2, xr2 xvmul.h xr3, xr3, xr3 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.wu.hu xr1, xr1, xr1 xvhaddw.wu.hu xr2, xr2, xr2 xvhaddw.wu.hu xr3, xr3, xr3 xvadd.w xr0, xr0, xr1 xvadd.w xr2, xr2, xr3 xvadd.w xr16, xr0, xr2 // Load data from pix1 and pix2 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 vilvl.d vr0, vr4, vr0 vilvl.d vr1, vr5, vr1 vilvl.d vr2, vr6, vr2 vilvl.d vr3, vr7, vr3 vilvl.d vr8, vr12, vr8 vilvl.d vr9, vr13, vr9 vilvl.d vr10, vr14, vr10 vilvl.d vr11, vr15, vr11 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr8, xr8 vext2xv.hu.bu xr9, xr9 vext2xv.hu.bu xr10, xr10 vext2xv.hu.bu xr11, xr11 // Calculate the square of the difference xvsub.h xr0, xr0, xr8 xvsub.h xr1, xr1, xr9 xvsub.h xr2, xr2, xr10 xvsub.h xr3, xr3, xr11 xvmul.h xr0, xr0, xr0 xvmul.h xr1, xr1, xr1 xvmul.h xr2, xr2, xr2 xvmul.h xr3, xr3, xr3 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.wu.hu xr1, xr1, xr1 xvhaddw.wu.hu xr2, xr2, xr2 xvhaddw.wu.hu xr3, xr3, xr3 xvadd.w xr0, xr0, xr1 xvadd.w xr2, xr2, xr3 xvadd.w xr0, xr0, xr2 xvadd.w xr0, xr0, xr16 // Calculate the sum xvhaddw.d.w xr0, xr0, xr0 xvhaddw.q.d xr0, xr0, xr0 xvpickve2gr.w t2, xr0, 0 xvpickve2gr.w t3, xr0, 4 add.d a0, t2, t3 endfunc_x264 
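/* For reference, every pixel_ssd_WxH routine in this file computes a plain sum
 * of squared differences over a WxH block of 8-bit pixels. A scalar C sketch
 * of the same computation (illustrative only: the helper name and the explicit
 * w/h parameters are not part of x264's API, which exposes one fixed-size
 * function per block size) is:
 *
 *     #include <stdint.h>
 *
 *     static int ssd_wxh( const uint8_t *pix1, intptr_t i_pix1,
 *                         const uint8_t *pix2, intptr_t i_pix2,
 *                         int w, int h )
 *     {
 *         int ssd = 0;
 *         for( int y = 0; y < h; y++ )
 *         {
 *             for( int x = 0; x < w; x++ )
 *             {
 *                 int d = pix1[x] - pix2[x];  // signed pixel difference
 *                 ssd += d * d;               // accumulate the square
 *             }
 *             pix1 += i_pix1;                 // step one row in each plane
 *             pix2 += i_pix2;
 *         }
 *         return ssd;
 *     }
 *
 * The LSX/LASX routines vectorize the inner loop: bytes are widened to 16-bit
 * lanes, the differences are squared, and the lanes are reduced horizontally
 * into a single 32-bit result.
 */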
/* * int pixel_ssd_8x8_lasx(const Pixel *pix1, intptr_t stride_pix1, * const Pixel *pix2, intptr_t stride_pix2) */ function_x264 pixel_ssd_8x8_lasx slli.d t0, a1, 1 add.d t1, a1, t0 add.d t2, a1, t1 slli.d t3, a3, 1 add.d t4, a3, t3 add.d t5, a3, t4 // Load data from pix1 and pix2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 add.d a0, a0, t2 LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 add.d a2, a2, t5 LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 vilvl.d vr0, vr4, vr0 vilvl.d vr1, vr5, vr1 vilvl.d vr2, vr6, vr2 vilvl.d vr3, vr7, vr3 vilvl.d vr8, vr12, vr8 vilvl.d vr9, vr13, vr9 vilvl.d vr10, vr14, vr10 vilvl.d vr11, vr15, vr11 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 vext2xv.hu.bu xr8, xr8 vext2xv.hu.bu xr9, xr9 vext2xv.hu.bu xr10, xr10 vext2xv.hu.bu xr11, xr11 // Calculate the square of the difference xvsub.h xr0, xr0, xr8 xvsub.h xr1, xr1, xr9 xvsub.h xr2, xr2, xr10 xvsub.h xr3, xr3, xr11 xvmul.h xr0, xr0, xr0 xvmul.h xr1, xr1, xr1 xvmul.h xr2, xr2, xr2 xvmul.h xr3, xr3, xr3 xvhaddw.wu.hu xr0, xr0, xr0 xvhaddw.wu.hu xr1, xr1, xr1 xvhaddw.wu.hu xr2, xr2, xr2 xvhaddw.wu.hu xr3, xr3, xr3 xvadd.w xr0, xr0, xr1 xvadd.w xr2, xr2, xr3 xvadd.w xr0, xr0, xr2 // Calculate the sum xvhaddw.d.w xr0, xr0, xr0 xvhaddw.q.d xr0, xr0, xr0 xvpickve2gr.w t2, xr0, 0 xvpickve2gr.w t3, xr0, 4 add.d a0, t2, t3 endfunc_x264 /* * int pixel_sa8d_16x16_lasx(const Pixel *pix1, intptr_t i_pix1, * const Pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_sa8d_16x16_lasx addi.d sp, sp, -8 fst.d f24, sp, 0 slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 slli.d t6, a1, 2 slli.d t7, a3, 2 slli.d t0, a1, 3 slli.d t1, a3, 3 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr15, xr11, xr13 xvsub.h xr16, xr11, xr13 add.d a0, a0, t6 add.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr9, xr11, xr13 xvsub.h xr10, xr11, xr13 xvadd.h xr17, xr15, xr9 xvadd.h xr18, xr16, xr10 
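// Final butterfly between the two 8x4 halves of this 8x8 sub-block: xr17/xr18
// hold the sums, the matching differences follow, and the absolute values of
// both are accumulated before moving on to the next sub-block.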
xvsub.h xr19, xr15, xr9 xvsub.h xr20, xr16, xr10 xvadda.h xr17, xr17, xr18 xvadda.h xr19, xr19, xr20 xvadd.h xr21, xr17, xr19 add.d a0, a0, t6 add.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr15, xr11, xr13 xvsub.h xr16, xr11, xr13 add.d a0, a0, t6 add.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr9, xr11, xr13 xvsub.h xr10, xr11, xr13 xvadd.h xr17, xr15, xr9 xvadd.h xr18, xr16, xr10 xvsub.h xr19, xr15, xr9 xvsub.h xr20, xr16, xr10 xvadda.h xr17, xr17, xr18 xvadda.h xr19, xr19, xr20 xvadd.h xr22, xr17, xr19 sub.d a0, a0, t6 sub.d a2, a2, t7 addi.d a0, a0, 8 addi.d a2, a2, 8 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr15, xr11, xr13 xvsub.h xr16, xr11, xr13 add.d a0, a0, t6 add.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, 
xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr9, xr11, xr13 xvsub.h xr10, xr11, xr13 xvadd.h xr17, xr15, xr9 xvadd.h xr18, xr16, xr10 xvsub.h xr19, xr15, xr9 xvsub.h xr20, xr16, xr10 xvadda.h xr17, xr17, xr18 xvadda.h xr19, xr19, xr20 xvadd.h xr23, xr17, xr19 sub.d a0, a0, t0 sub.d a2, a2, t1 sub.d a0, a0, t6 sub.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr15, xr11, xr13 xvsub.h xr16, xr11, xr13 add.d a0, a0, t6 add.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr12, 0x13 xvadd.h xr9, xr11, xr13 xvsub.h xr10, xr11, xr13 xvadd.h xr17, xr15, xr9 xvadd.h xr18, xr16, xr10 xvsub.h xr19, xr15, xr9 xvsub.h xr20, xr16, xr10 xvadda.h xr17, xr17, xr18 xvadda.h xr19, xr19, xr20 xvadd.h xr24, xr17, xr19 xvadd.h xr21, xr21, xr22 xvadd.h xr23, xr23, xr24 xvhaddw.wu.hu xr21, xr21, xr21 xvhaddw.wu.hu xr23, xr23, xr23 xvadd.w xr21, xr21, xr23 xvhaddw.du.wu xr21, xr21, xr21 xvhaddw.qu.du xr21, xr21, xr21 xvpickve2gr.du t4, xr21, 0 xvpickve2gr.du t5, xr21, 2 add.d t4, t4, t5 addi.d t4, t4, 2 srli.d a0, t4, 2 fld.d f24, sp, 0 addi.d sp, sp, 8 endfunc_x264 /* * int pixel_sa8d_8x8_lasx(const Pixel *pix1, intptr_t i_pix1, * const Pixel *pix2, intptr_t i_pix2) */ function_x264 pixel_sa8d_8x8_lasx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 slli.d t6, a1, 2 slli.d t7, a3, 2 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, 
xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvor.v xr14, xr12, xr12 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr14, 0x13 xvadd.h xr15, xr11, xr13 xvsub.h xr16, xr11, xr13 add.d a0, a0, t6 add.d a2, a2, t7 // Load data from pix1 and pix2 FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 vilvl.d vr1, vr2, vr1 vilvl.d vr3, vr4, vr3 vilvl.d vr5, vr6, vr5 vilvl.d vr7, vr8, vr7 xvpermi.q xr1, xr3, 0x02 xvpermi.q xr5, xr7, 0x02 xvsubwev.h.bu xr9, xr1, xr5 xvsubwod.h.bu xr10, xr1, xr5 xvadd.h xr11, xr9, xr10 /* a0 + a1 */ xvsub.h xr12, xr9, xr10 /* a0 - a1 */ xvpackev.h xr9, xr12, xr11 xvpackod.h xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvpackev.w xr9, xr12, xr11 xvpackod.w xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 xvsub.h xr12, xr9, xr10 xvpackev.d xr9, xr12, xr11 xvpackod.d xr10, xr12, xr11 xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ xvsub.h xr12, xr9, xr10 xvor.v xr13, xr11, xr11 xvor.v xr14, xr12, xr12 xvpermi.q xr11, xr12, 0x02 xvpermi.q xr13, xr14, 0x13 xvadd.h xr9, xr11, xr13 xvsub.h xr10, xr11, xr13 xvadd.h xr17, xr15, xr9 xvadd.h xr18, xr16, xr10 xvsub.h xr19, xr15, xr9 xvsub.h xr20, xr16, xr10 xvadda.h xr17, xr17, xr18 xvadda.h xr19, xr19, xr20 xvadd.h xr17, xr17, xr19 xvhaddw.wu.hu xr17, xr17, xr17 xvhaddw.du.wu xr17, xr17, xr17 xvhaddw.qu.du xr17, xr17, xr17 xvpickve2gr.wu t4, xr17, 0 xvpickve2gr.wu t5, xr17, 4 add.d t4, t4, t5 addi.d t4, t4, 2 srli.d a0, t4, 2 endfunc_x264 .macro sse_diff_8width_lasx in0, in1 fld.d f0, \in0, 0 fld.d f1, \in0, FENC_STRIDE fld.d f2, \in0, FENC_STRIDE * 2 fld.d f3, \in0, FENC_STRIDE * 3 fld.d f4, \in1, 0 fld.d f5, \in1, FDEC_STRIDE fld.d f6, \in1, FDEC_STRIDE * 2 fld.d f7, \in1, FDEC_STRIDE * 3 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 xvpermi.q xr1, xr0, 0x20 xvpermi.q xr5, xr4, 0x20 xvilvl.b xr2, xr5, xr1 xvilvh.b xr6, xr5, xr1 xvhsubw.hu.bu xr3, xr2, xr2 xvhsubw.hu.bu xr4, xr6, xr6 xvdp2add.w.h xr8, xr3, xr3 xvdp2add.w.h xr8, xr4, xr4 xvadd.h xr9, xr9, xr3 xvadd.h xr9, xr9, xr4 .endm /* * int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2, * int32_t ssd[2] ) */ function_x264 pixel_var2_8x16_lasx add.d t0, a0, zero add.d t1, a1, zero xvxor.v xr8, xr8, xr8 xvxor.v xr9, xr9, xr9 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 xvhaddw.w.h xr9, xr9, xr9 xvhaddw.d.w xr9, xr9, xr9 xvhaddw.q.d xr9, xr9, xr9 xvpickve2gr.wu t2, xr9, 0 xvpickve2gr.wu t3, xr9, 4 add.w t2, t2, t3 xvhaddw.d.w xr8, xr8, xr8 xvhaddw.q.d xr8, xr8, xr8 xvpickve2gr.wu t3, xr8, 0 xvpickve2gr.wu t4, xr8, 4 add.w t3, t4, t3 st.w t3, a2, 0 mul.w t2, t2, t2 srai.w t2, t2, 7 sub.w t3, t3, t2 xvxor.v xr8, xr8, xr8 xvxor.v xr9, xr9, xr9 addi.d a0, t0, FENC_STRIDE / 2 addi.d a1, t1, FDEC_STRIDE / 2 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, 
a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 xvhaddw.w.h xr9, xr9, xr9 xvhaddw.d.w xr9, xr9, xr9 xvhaddw.q.d xr9, xr9, xr9 xvpickve2gr.wu t4, xr9, 0 xvpickve2gr.wu t5, xr9, 4 add.w t4, t4, t5 xvhaddw.d.w xr8, xr8, xr8 xvhaddw.q.d xr8, xr8, xr8 xvpickve2gr.wu t5, xr8, 0 xvpickve2gr.wu t6, xr8, 4 add.w t5, t6, t5 st.w t5, a2, 4 mul.w t4, t4, t4 srai.w t4, t4, 7 sub.w t5, t5, t4 add.w a0, t3, t5 endfunc_x264 /* * int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2, * int32_t ssd[2] ) */ function_x264 pixel_var2_8x8_lasx add.d t0, a0, zero add.d t1, a1, zero xvxor.v xr8, xr8, xr8 xvxor.v xr9, xr9, xr9 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 xvhaddw.w.h xr9, xr9, xr9 xvhaddw.d.w xr9, xr9, xr9 xvhaddw.q.d xr9, xr9, xr9 xvpickve2gr.wu t2, xr9, 0 xvpickve2gr.wu t3, xr9, 4 add.w t2, t2, t3 xvhaddw.d.w xr8, xr8, xr8 xvhaddw.q.d xr8, xr8, xr8 xvpickve2gr.wu t3, xr8, 0 xvpickve2gr.wu t4, xr8, 4 add.w t3, t4, t3 st.w t3, a2, 0 mul.w t2, t2, t2 srai.w t2, t2, 6 sub.w t3, t3, t2 xvxor.v xr8, xr8, xr8 xvxor.v xr9, xr9, xr9 addi.d a0, t0, FENC_STRIDE / 2 addi.d a1, t1, FDEC_STRIDE / 2 sse_diff_8width_lasx a0, a1 addi.d a0, a0, FENC_STRIDE * 4 addi.d a1, a1, FDEC_STRIDE * 4 sse_diff_8width_lasx a0, a1 xvhaddw.w.h xr9, xr9, xr9 xvhaddw.d.w xr9, xr9, xr9 xvhaddw.q.d xr9, xr9, xr9 xvpickve2gr.wu t4, xr9, 0 xvpickve2gr.wu t5, xr9, 4 add.w t4, t4, t5 xvhaddw.d.w xr8, xr8, xr8 xvhaddw.q.d xr8, xr8, xr8 xvpickve2gr.wu t5, xr8, 0 xvpickve2gr.wu t6, xr8, 4 add.w t5, t6, t5 st.w t5, a2, 4 mul.w t4, t4, t4 srai.w t4, t4, 6 sub.w t5, t5, t4 add.w a0, t3, t5 endfunc_x264 /* * uint64_t x264_pixel_hadamard_ac_8x8( pixel *pix, intptr_t stride ) */ function_x264 hadamard_ac_8x8_lsx slli.d t0, a1, 1 add.d t1, t0, a1 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 alsl.d a0, a1, a0, 2 FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vpickev.b vr2, vr1, vr0 vpickod.b vr3, vr1, vr0 vaddwev.h.bu vr6, vr2, vr3 vaddwod.h.bu vr7, vr2, vr3 vsubwev.h.bu vr8, vr2, vr3 vsubwod.h.bu vr9, vr2, vr3 vadd.h vr10, vr6, vr7 vadd.h vr11, vr8, vr9 vsub.h vr12, vr6, vr7 vsub.h vr13, vr8, vr9 vilvl.h vr6, vr11, vr10 vilvh.h vr7, vr11, vr10 vilvl.h vr8, vr13, vr12 vilvh.h vr9, vr13, vr12 vilvl.w vr10, vr8, vr6 vilvh.w vr11, vr8, vr6 vilvl.w vr12, vr9, vr7 vilvh.w vr13, vr9, vr7 vadd.h vr6, vr10, vr11 vadd.h vr7, vr12, vr13 vsub.h vr8, vr10, vr11 vsub.h vr9, vr12, vr13 vadd.h vr10, vr6, vr7 vadd.h vr11, vr8, vr9 vsub.h vr12, vr6, vr7 vsub.h vr13, vr8, vr9 vpickev.b vr2, vr5, vr4 vpickod.b vr3, vr5, vr4 vaddwev.h.bu vr6, vr2, vr3 vaddwod.h.bu vr7, vr2, vr3 vsubwev.h.bu vr8, vr2, vr3 vsubwod.h.bu vr9, vr2, vr3 vadd.h vr14, vr6, vr7 vadd.h vr15, vr8, vr9 vsub.h vr16, vr6, vr7 vsub.h vr17, vr8, vr9 vilvl.h vr6, vr15, vr14 vilvh.h vr7, vr15, vr14 vilvl.h vr8, vr17, vr16 vilvh.h vr9, vr17, vr16 vilvl.w vr14, vr8, vr6 vilvh.w vr15, vr8, vr6 vilvl.w vr16, vr9, vr7 vilvh.w vr17, vr9, vr7 vadd.h vr6, vr14, vr15 vadd.h vr7, vr16, vr17 vsub.h vr8, vr14, vr15 vsub.h vr9, vr16, vr17 vadd.h vr14, vr6, vr7 vadd.h vr15, vr8, vr9 vsub.h vr16, vr6, vr7 vsub.h vr17, vr8, vr9 vadd.h vr18, vr10, vr14 vpickve2gr.hu t0, vr18, 0 vpickve2gr.hu t1, vr18, 4 add.d t1, t0, t1 // dc vadda.h vr4, vr11, vr10 vadda.h vr5, vr13, vr12 vadda.h vr6, vr15, vr14 vadda.h vr7, vr17, vr16 vadd.h vr4, vr5, vr4 vadd.h vr6, vr7, vr6 vadd.h vr4, vr4, vr6 vhaddw.wu.hu vr4, vr4, vr4 vhaddw.du.wu vr4, vr4, vr4 vhaddw.qu.du 
vr4, vr4, vr4 vpickve2gr.wu t0, vr4, 0 // sum4 vpackev.h vr0, vr11, vr10 vpackev.h vr1, vr13, vr12 vpackev.h vr2, vr15, vr14 vpackev.h vr3, vr17, vr16 vpackod.h vr4, vr11, vr10 vpackod.h vr5, vr13, vr12 vpackod.h vr6, vr15, vr14 vpackod.h vr7, vr17, vr16 vilvl.d vr10, vr1, vr0 vilvh.d vr11, vr1, vr0 vilvl.d vr12, vr3, vr2 vilvh.d vr13, vr3, vr2 vilvl.d vr14, vr5, vr4 vilvh.d vr15, vr5, vr4 vilvl.d vr16, vr7, vr6 vilvh.d vr17, vr7, vr6 vadd.h vr0, vr10, vr11 vadd.h vr1, vr12, vr13 vadd.h vr2, vr14, vr16 vadd.h vr3, vr15, vr17 vsub.h vr4, vr10, vr11 vsub.h vr5, vr12, vr13 vsub.h vr6, vr14, vr16 vsub.h vr7, vr15, vr17 vadd.h vr10, vr0, vr1 vadd.h vr11, vr2, vr3 vadd.h vr12, vr4, vr5 vadd.h vr13, vr6, vr7 vsub.h vr14, vr0, vr1 vsub.h vr15, vr2, vr3 vsub.h vr16, vr4, vr5 vsub.h vr17, vr6, vr7 vadda.h vr10, vr10, vr11 vadda.h vr11, vr12, vr13 vadda.h vr12, vr14, vr15 vadda.h vr13, vr16, vr17 vadd.h vr10, vr10, vr11 vadd.h vr11, vr12, vr13 vadd.h vr10, vr10, vr11 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.du.wu vr10, vr10, vr10 vhaddw.qu.du vr10, vr10, vr10 vpickve2gr.wu t2, vr10, 0 // sum8 sub.d t0, t0, t1 sub.d t2, t2, t1 slli.d t2, t2, 32 add.d a0, t2, t0 endfunc_x264 /* * int x264_pixel_satd_4x8( pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2 ) */ function_x264 pixel_satd_4x8_lsx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 // Load data from pix1 and pix2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr13 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr14 vadd.h vr13, vr14, vr13 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu t5, vr13, 0 srli.d a0, t5, 1 endfunc_x264 /* * int x264_pixel_satd_4x16( uint8_t *p_pix1, intptr_t i_stride, * uint8_t *p_pix2, intptr_t i_stride2 ) */ function_x264 pixel_satd_4x16_lsx slli.d t2, a1, 1 slli.d t3, a3, 1 add.d t4, a1, t2 add.d t5, a3, t3 // Load data from pix1 and pix2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr13 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr14 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr15 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 pixel_satd_4x4_lsx_core vr16 vadd.h vr13, vr14, vr13 vadd.h vr15, vr16, vr15 vadd.h vr13, vr15, vr13 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu t5, vr13, 0 srli.d a0, t5, 1 endfunc_x264 .macro pixel_satd_8x4_lsx_core out0, out1, out2, out3 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr2, vr5, vr4 vilvl.d vr3, vr7, vr6 vsubwev.h.bu vr4, vr0, vr2 vsubwod.h.bu vr5, vr0, vr2 vsubwev.h.bu vr6, vr1, vr3 vsubwod.h.bu vr7, vr1, vr3 vadd.h vr0, vr4, vr5 vsub.h vr1, vr4, vr5 vadd.h vr2, vr6, vr7 vsub.h vr3, vr6, vr7 vpackev.h vr4, vr1, vr0 vpackod.h vr5, vr1, vr0 vpackev.h vr6, vr3, vr2 vpackod.h vr7, vr3, vr2 vadd.h vr8, vr4, vr5 vsub.h vr9, vr4, vr5 vadd.h vr10, vr6, vr7 vsub.h vr11, vr6, vr7 vilvl.d vr4, vr9, vr8 vilvh.d vr5, vr9, vr8 vilvl.d vr6, vr11, vr10 vilvh.d vr7, vr11, vr10 vadd.h vr8, vr4, vr5 vsub.h vr9, vr4, 
vr5 vadd.h vr10, vr6, vr7 vsub.h vr11, vr6, vr7 vadd.h \out0, vr8, vr10 vsub.h \out1, vr8, vr10 vadd.h \out2, vr9, vr11 vsub.h \out3, vr9, vr11 .endm /* * int x264_pixel_satd_8x4( uint8_t *p_pix1, intptr_t i_stride, * uint8_t *p_pix2, intptr_t i_stride2 ) */ function_x264 pixel_satd_8x4_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 vadda.h vr12, vr13, vr12 vadda.h vr13, vr15, vr14 vadd.h vr12, vr13, vr12 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.du.wu vr12, vr12, vr12 vhaddw.qu.du vr12, vr12, vr12 vpickve2gr.wu t4, vr12, 0 srli.d a0, t4, 1 endfunc_x264 /* * int x264_pixel_satd_8x8( uint8_t *p_pix1, intptr_t i_stride, * uint8_t *p_pix2, intptr_t i_stride2 ) */ function_x264 pixel_satd_8x8_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 vadda.h vr12, vr13, vr12 vadda.h vr13, vr15, vr14 vadd.h vr12, vr13, vr12 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 vadda.h vr13, vr14, vr13 vadda.h vr14, vr16, vr15 vadd.h vr13, vr14, vr13 vadd.h vr12, vr13, vr12 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.du.wu vr12, vr12, vr12 vhaddw.qu.du vr12, vr12, vr12 vpickve2gr.wu t4, vr12, 0 srli.d a0, t4, 1 endfunc_x264 /* * int x264_pixel_satd_8x8( uint8_t *p_pix1, intptr_t i_stride, * uint8_t *p_pix2, intptr_t i_stride2 ) */ function_x264 pixel_satd_8x16_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 vadda.h vr12, vr13, vr12 vadda.h vr13, vr15, vr14 vadd.h vr12, vr13, vr12 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 vadda.h vr13, vr14, vr13 vadda.h vr14, vr16, vr15 vadd.h vr13, vr14, vr13 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 vadda.h vr14, vr15, vr14 vadda.h vr15, vr17, vr16 vadd.h vr14, vr15, vr14 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 vadda.h vr15, vr16, vr15 vadda.h vr16, vr18, vr17 vadd.h vr15, vr16, vr15 vadd.h vr12, vr12, vr13 vadd.h vr14, vr14, vr15 vadd.h vr12, vr12, vr14 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.du.wu vr12, vr12, vr12 vhaddw.qu.du vr12, vr12, vr12 vpickve2gr.wu t4, vr12, 0 srli.d a0, t4, 1 endfunc_x264 /* * int x264_pixel_satd_16x8( uint8_t *p_pix1, intptr_t i_stride, * uint8_t *p_pix2, intptr_t i_stride2 ) */ function_x264 pixel_satd_16x8_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 vadda.h vr12, vr13, vr12 vadda.h vr13, vr15, vr14 vadd.h vr12, vr13, vr12 addi.d t5, a0, 8 addi.d t6, a2, 8 FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 vadda.h vr13, 
vr14, vr13 vadda.h vr14, vr16, vr15 vadd.h vr13, vr14, vr13 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 vadda.h vr14, vr15, vr14 vadda.h vr15, vr17, vr16 vadd.h vr14, vr15, vr14 addi.d t5, a0, 8 addi.d t6, a2, 8 FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 vadda.h vr15, vr16, vr15 vadda.h vr16, vr18, vr17 vadd.h vr15, vr16, vr15 vadd.h vr12, vr13, vr12 vadd.h vr14, vr15, vr14 vadd.h vr12, vr14, vr12 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.du.wu vr12, vr12, vr12 vhaddw.qu.du vr12, vr12, vr12 vpickve2gr.wu t4, vr12, 0 srli.d a0, t4, 1 endfunc_x264 /* * int x264_pixel_satd_16x16( uint8_t *p_pix1, intptr_t i_stride, * uint8_t *p_pix2, intptr_t i_stride2 ) */ function_x264 pixel_satd_16x16_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 vadda.h vr12, vr13, vr12 vadda.h vr13, vr15, vr14 vadd.h vr12, vr13, vr12 addi.d t5, a0, 8 addi.d t6, a2, 8 FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 vadda.h vr13, vr14, vr13 vadda.h vr14, vr16, vr15 vadd.h vr13, vr14, vr13 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 vadda.h vr14, vr15, vr14 vadda.h vr15, vr17, vr16 vadd.h vr14, vr15, vr14 addi.d t5, a0, 8 addi.d t6, a2, 8 FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 vadda.h vr15, vr16, vr15 vadda.h vr16, vr18, vr17 vadd.h vr15, vr16, vr15 vadd.h vr12, vr13, vr12 vadd.h vr14, vr15, vr14 vadd.h vr19, vr14, vr12 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 vadda.h vr12, vr13, vr12 vadda.h vr13, vr15, vr14 vadd.h vr12, vr13, vr12 addi.d t5, a0, 8 addi.d t6, a2, 8 FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 vadda.h vr13, vr14, vr13 vadda.h vr14, vr16, vr15 vadd.h vr13, vr14, vr13 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 vadda.h vr14, vr15, vr14 vadda.h vr15, vr17, vr16 vadd.h vr14, vr15, vr14 addi.d t5, a0, 8 addi.d t6, a2, 8 FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 vadda.h vr15, vr16, vr15 vadda.h vr16, vr18, vr17 vadd.h vr15, vr16, vr15 vadd.h vr12, vr13, vr12 vadd.h vr14, vr15, vr14 vadd.h vr12, vr14, vr12 vadd.h vr12, vr19, vr12 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.du.wu vr12, vr12, vr12 vhaddw.qu.du vr12, vr12, vr12 vpickve2gr.wu t4, vr12, 0 srli.d a0, t4, 1 endfunc_x264 /* * int x264_pixel_ssd_4x4( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_4x4_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.w vr0, vr1, vr0 
vilvl.w vr1, vr3, vr2 vilvl.w vr4, vr5, vr4 vilvl.w vr5, vr7, vr6 vilvl.d vr0, vr1, vr0 vilvl.d vr4, vr5, vr4 vsubwev.h.bu vr1, vr0, vr4 vsubwod.h.bu vr2, vr0, vr4 vmul.h vr5, vr1, vr1 vmul.h vr6, vr2, vr2 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vadd.w vr5, vr5, vr6 vhaddw.d.w vr5, vr5, vr5 vhaddw.q.d vr5, vr5, vr5 vpickve2gr.w a0, vr5, 0 endfunc_x264 /* * int x264_pixel_ssd_4x8( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_4x8_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.w vr0, vr1, vr0 vilvl.w vr1, vr3, vr2 vilvl.w vr4, vr5, vr4 vilvl.w vr5, vr7, vr6 vilvl.d vr0, vr1, vr0 vilvl.d vr4, vr5, vr4 vsubwev.h.bu vr1, vr0, vr4 vsubwod.h.bu vr2, vr0, vr4 vmul.h vr5, vr1, vr1 vmul.h vr6, vr2, vr2 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vadd.w vr10, vr5, vr6 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.w vr0, vr1, vr0 vilvl.w vr1, vr3, vr2 vilvl.w vr4, vr5, vr4 vilvl.w vr5, vr7, vr6 vilvl.d vr0, vr1, vr0 vilvl.d vr4, vr5, vr4 vsubwev.h.bu vr1, vr0, vr4 vsubwod.h.bu vr2, vr0, vr4 vmul.h vr5, vr1, vr1 vmul.h vr6, vr2, vr2 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vadd.w vr5, vr5, vr6 vadd.w vr5, vr5, vr10 vhaddw.d.w vr5, vr5, vr5 vhaddw.q.d vr5, vr5, vr5 vpickve2gr.w a0, vr5, 0 endfunc_x264 /* * int x264_pixel_ssd_4x16( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_4x16_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.w vr0, vr1, vr0 vilvl.w vr1, vr3, vr2 vilvl.w vr4, vr5, vr4 vilvl.w vr5, vr7, vr6 vilvl.d vr0, vr1, vr0 vilvl.d vr4, vr5, vr4 vsubwev.h.bu vr1, vr0, vr4 vsubwod.h.bu vr2, vr0, vr4 vmul.h vr5, vr1, vr1 vmul.h vr6, vr2, vr2 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vadd.w vr10, vr5, vr6 .rept 3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.w vr0, vr1, vr0 vilvl.w vr1, vr3, vr2 vilvl.w vr4, vr5, vr4 vilvl.w vr5, vr7, vr6 vilvl.d vr0, vr1, vr0 vilvl.d vr4, vr5, vr4 vsubwev.h.bu vr1, vr0, vr4 vsubwod.h.bu vr2, vr0, vr4 vmul.h vr5, vr1, vr1 vmul.h vr6, vr2, vr2 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vadd.w vr5, vr5, vr6 vadd.w vr10, vr5, vr10 .endr vhaddw.d.w vr10, vr10, vr10 vhaddw.q.d vr10, vr10, vr10 vpickve2gr.w a0, vr10, 0 endfunc_x264 /* * int x264_pixel_ssd_8x4( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_8x4_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vmul.h vr2, vr2, vr2 vmul.h vr3, vr3, vr3 vmul.h vr6, vr6, vr6 vmul.h vr7, vr7, vr7 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr2, vr2, vr6 vhaddw.d.w vr2, vr2, vr2 vhaddw.q.d vr2, vr2, vr2 vpickve2gr.w a0, vr2, 0 endfunc_x264 /* * int x264_pixel_ssd_8x8( pixel *pix1, 
intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_8x8_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vmul.h vr2, vr2, vr2 vmul.h vr3, vr3, vr3 vmul.h vr6, vr6, vr6 vmul.h vr7, vr7, vr7 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr10, vr2, vr6 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vmul.h vr2, vr2, vr2 vmul.h vr3, vr3, vr3 vmul.h vr6, vr6, vr6 vmul.h vr7, vr7, vr7 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr11, vr2, vr6 vadd.w vr10, vr10, vr11 vhaddw.d.w vr10, vr10, vr10 vhaddw.q.d vr10, vr10, vr10 vpickve2gr.w a0, vr10, 0 endfunc_x264 /* * int x264_pixel_ssd_8x16( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_8x16_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vmul.h vr2, vr2, vr2 vmul.h vr3, vr3, vr3 vmul.h vr6, vr6, vr6 vmul.h vr7, vr7, vr7 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr10, vr2, vr6 .rept 3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vmul.h vr2, vr2, vr2 vmul.h vr3, vr3, vr3 vmul.h vr6, vr6, vr6 vmul.h vr7, vr7, vr7 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr11, vr2, vr6 vadd.w vr10, vr10, vr11 .endr vhaddw.d.w vr10, vr10, vr10 vhaddw.q.d vr10, vr10, vr10 vpickve2gr.w a0, vr10, 0 endfunc_x264 /* * int x264_pixel_ssd_16x8( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_16x8_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 vsubwev.h.bu vr8, vr0, vr4 vsubwod.h.bu vr9, vr0, vr4 vsubwev.h.bu vr10, vr1, vr5 vsubwod.h.bu vr11, vr1, vr5 vsubwev.h.bu vr12, vr2, vr6 vsubwod.h.bu vr13, vr2, vr6 vsubwev.h.bu vr14, vr3, vr7 vsubwod.h.bu vr15, vr3, vr7 vmul.h vr8, vr8, vr8 vmul.h vr9, vr9, vr9 vmul.h vr10, vr10, vr10 vmul.h vr11, vr11, vr11 vmul.h vr12, vr12, vr12 vmul.h vr13, vr13, vr13 vmul.h vr14, 
vr14, vr14 vmul.h vr15, vr15, vr15 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.wu.hu vr14, vr14, vr14 vhaddw.wu.hu vr15, vr15, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr10, vr12, vr13 vadd.w vr11, vr14, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr16, vr8, vr9 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 vsubwev.h.bu vr8, vr0, vr4 vsubwod.h.bu vr9, vr0, vr4 vsubwev.h.bu vr10, vr1, vr5 vsubwod.h.bu vr11, vr1, vr5 vsubwev.h.bu vr12, vr2, vr6 vsubwod.h.bu vr13, vr2, vr6 vsubwev.h.bu vr14, vr3, vr7 vsubwod.h.bu vr15, vr3, vr7 vmul.h vr8, vr8, vr8 vmul.h vr9, vr9, vr9 vmul.h vr10, vr10, vr10 vmul.h vr11, vr11, vr11 vmul.h vr12, vr12, vr12 vmul.h vr13, vr13, vr13 vmul.h vr14, vr14, vr14 vmul.h vr15, vr15, vr15 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.wu.hu vr14, vr14, vr14 vhaddw.wu.hu vr15, vr15, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr10, vr12, vr13 vadd.w vr11, vr14, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr17, vr8, vr9 vadd.w vr10, vr16, vr17 vhaddw.d.w vr10, vr10, vr10 vhaddw.q.d vr10, vr10, vr10 vpickve2gr.w a0, vr10, 0 endfunc_x264 /* * int x264_pixel_ssd_16x16( pixel *pix1, intptr_t i_stride_pix1, * pixel *pix2, intptr_t i_stride_pix2 ) */ function_x264 pixel_ssd_16x16_lsx slli.d t0, a1, 1 add.d t1, a1, t0 slli.d t2, a3, 1 add.d t3, a3, t2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 vsubwev.h.bu vr8, vr0, vr4 vsubwod.h.bu vr9, vr0, vr4 vsubwev.h.bu vr10, vr1, vr5 vsubwod.h.bu vr11, vr1, vr5 vsubwev.h.bu vr12, vr2, vr6 vsubwod.h.bu vr13, vr2, vr6 vsubwev.h.bu vr14, vr3, vr7 vsubwod.h.bu vr15, vr3, vr7 vmul.h vr8, vr8, vr8 vmul.h vr9, vr9, vr9 vmul.h vr10, vr10, vr10 vmul.h vr11, vr11, vr11 vmul.h vr12, vr12, vr12 vmul.h vr13, vr13, vr13 vmul.h vr14, vr14, vr14 vmul.h vr15, vr15, vr15 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.wu.hu vr14, vr14, vr14 vhaddw.wu.hu vr15, vr15, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr10, vr12, vr13 vadd.w vr11, vr14, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr16, vr8, vr9 .rept 3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 vsubwev.h.bu vr8, vr0, vr4 vsubwod.h.bu vr9, vr0, vr4 vsubwev.h.bu vr10, vr1, vr5 vsubwod.h.bu vr11, vr1, vr5 vsubwev.h.bu vr12, vr2, vr6 vsubwod.h.bu vr13, vr2, vr6 vsubwev.h.bu vr14, vr3, vr7 vsubwod.h.bu vr15, vr3, vr7 vmul.h vr8, vr8, vr8 vmul.h vr9, vr9, vr9 vmul.h vr10, vr10, vr10 vmul.h vr11, vr11, vr11 vmul.h vr12, vr12, vr12 vmul.h vr13, vr13, vr13 vmul.h vr14, vr14, vr14 vmul.h vr15, vr15, vr15 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.wu.hu vr12, vr12, vr12 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.wu.hu vr14, vr14, vr14 vhaddw.wu.hu vr15, vr15, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr10, vr12, vr13 vadd.w vr11, vr14, vr15 vadd.w vr8, vr8, vr9 vadd.w vr9, vr10, vr11 vadd.w vr17, vr8, vr9 vadd.w vr16, vr16, 
vr17 .endr vhaddw.d.w vr16, vr16, vr16 vhaddw.q.d vr16, vr16, vr16 vpickve2gr.w a0, vr16, 0 endfunc_x264 /* * int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) */ .macro pixel_sa8d_8x8_lsx_core out0, out1, out2, out3 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vadd.h vr8, vr2, vr3 vsub.h vr9, vr2, vr3 vadd.h vr10, vr6, vr7 vsub.h vr11, vr6, vr7 vpackev.h vr0, vr9, vr8 vpackod.h vr1, vr9, vr8 vpackev.h vr2, vr11, vr10 vpackod.h vr3, vr11, vr10 vadd.h vr4, vr0, vr1 vsub.h vr5, vr0, vr1 vadd.h vr6, vr2, vr3 vsub.h vr7, vr2, vr3 vilvl.d vr0, vr5, vr4 vilvh.d vr1, vr5, vr4 vilvl.d vr2, vr7, vr6 vilvh.d vr3, vr7, vr6 vadd.h vr12, vr0, vr1 vsub.h vr13, vr0, vr1 vadd.h vr14, vr2, vr3 vsub.h vr15, vr2, vr3 alsl.d t4, a1, a0, 2 alsl.d t5, a3, a2, 2 FLDD_LOADX_4 t4, a1, t0, t1, f0, f1, f2, f3 FLDD_LOADX_4 t5, a3, t2, t3, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vsubwev.h.bu vr2, vr0, vr4 vsubwod.h.bu vr3, vr0, vr4 vsubwev.h.bu vr6, vr1, vr5 vsubwod.h.bu vr7, vr1, vr5 vadd.h vr8, vr2, vr3 vsub.h vr9, vr2, vr3 vadd.h vr10, vr6, vr7 vsub.h vr11, vr6, vr7 vpackev.h vr0, vr9, vr8 vpackod.h vr1, vr9, vr8 vpackev.h vr2, vr11, vr10 vpackod.h vr3, vr11, vr10 vadd.h vr4, vr0, vr1 vsub.h vr5, vr0, vr1 vadd.h vr6, vr2, vr3 vsub.h vr7, vr2, vr3 vilvl.d vr0, vr5, vr4 vilvh.d vr1, vr5, vr4 vilvl.d vr2, vr7, vr6 vilvh.d vr3, vr7, vr6 vadd.h vr4, vr0, vr1 vsub.h vr5, vr0, vr1 vadd.h vr6, vr2, vr3 vsub.h vr7, vr2, vr3 // vr12 vr13 vr14 vr15 vpickev.w vr0, vr13, vr12 vpickod.w vr1, vr13, vr12 vpickev.w vr2, vr15, vr14 vpickod.w vr3, vr15, vr14 vadd.h vr8, vr0, vr1 vsub.h vr9, vr0, vr1 vadd.h vr10, vr2, vr3 vsub.h vr11, vr2, vr3 vadd.h vr12, vr8, vr10 vadd.h vr13, vr9, vr11 vsub.h vr14, vr8, vr10 vsub.h vr15, vr9, vr11 // vr4 vr5 vr6 vr7 vpickev.w vr0, vr5, vr4 vpickod.w vr1, vr5, vr4 vpickev.w vr2, vr7, vr6 vpickod.w vr3, vr7, vr6 vadd.h vr8, vr0, vr1 vsub.h vr9, vr0, vr1 vadd.h vr10, vr2, vr3 vsub.h vr11, vr2, vr3 vadd.h vr4, vr8, vr10 vadd.h vr5, vr9, vr11 vsub.h vr6, vr8, vr10 vsub.h vr7, vr9, vr11 vadd.h vr0, vr12, vr4 vadd.h vr1, vr13, vr5 vadd.h vr2, vr14, vr6 vadd.h vr3, vr15, vr7 vsub.h vr8, vr12, vr4 vsub.h vr9, vr13, vr5 vsub.h vr10, vr14, vr6 vsub.h vr11, vr15, vr7 vadda.h \out0, vr0, vr8 vadda.h \out1, vr1, vr9 vadda.h \out2, vr2, vr10 vadda.h \out3, vr3, vr11 .endm function_x264 pixel_sa8d_8x8_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 vadd.h vr0, vr0, vr1 vadd.h vr1, vr2, vr3 vadd.h vr17, vr0, vr1 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.du.wu vr17, vr17, vr17 vhaddw.qu.du vr17, vr17, vr17 vpickve2gr.wu t5, vr17, 0 addi.d t5, t5, 2 srli.d a0, t5, 2 endfunc_x264 /* * int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, * pixel *pix2, intptr_t i_pix2 ) */ function_x264 pixel_sa8d_16x16_lsx slli.d t0, a1, 1 add.d t1, t0, a1 slli.d t2, a3, 1 add.d t3, t2, a3 add.d t6, a0, zero add.d t7, a2, zero pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 vadd.h vr0, vr0, vr1 vadd.h vr1, vr2, vr3 vadd.h vr16, vr0, vr1 addi.d a0, t6, 8 addi.d a2, t7, 8 pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 vadd.h vr0, vr0, vr1 vadd.h vr1, vr2, vr3 vadd.h vr17, vr0, vr1 alsl.d a0, a1, t6, 3 alsl.d a2, a3, t7, 3 pixel_sa8d_8x8_lsx_core 
vr0, vr1, vr2, vr3 vadd.h vr0, vr0, vr1 vadd.h vr1, vr2, vr3 vadd.h vr18, vr0, vr1 addi.d a0, a0, 8 addi.d a2, a2, 8 pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 vadd.h vr0, vr0, vr1 vadd.h vr1, vr2, vr3 vadd.h vr19, vr0, vr1 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vadd.w vr16, vr17, vr16 vadd.w vr18, vr19, vr18 vadd.w vr17, vr18, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.qu.du vr17, vr17, vr17 vpickve2gr.wu t5, vr17, 0 addi.d t5, t5, 2 srli.d a0, t5, 2 endfunc_x264 /* * uint64_t pixel_var_8x8( pixel *pix, intptr_t i_stride ) */ function_x264 pixel_var_8x8_lsx slli.d t0, a1, 1 add.d t1, a1, t0 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 alsl.d a0, a1, a0, 2 FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vhaddw.hu.bu vr2, vr0, vr0 vhaddw.hu.bu vr3, vr1, vr1 vhaddw.hu.bu vr6, vr4, vr4 vhaddw.hu.bu vr7, vr5, vr5 vadd.h vr2, vr2, vr3 vadd.h vr6, vr6, vr7 vadd.h vr2, vr2, vr6 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr2, vr2, vr2 vpickve2gr.wu t5, vr2, 0 // sum vmulwev.h.bu vr2, vr0, vr0 vmulwod.h.bu vr3, vr0, vr0 vmulwev.h.bu vr6, vr1, vr1 vmulwod.h.bu vr7, vr1, vr1 vmulwev.h.bu vr8, vr4, vr4 vmulwod.h.bu vr9, vr4, vr4 vmulwev.h.bu vr10, vr5, vr5 vmulwod.h.bu vr11, vr5, vr5 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr8, vr8, vr9 vadd.w vr10, vr10, vr11 vadd.w vr2, vr2, vr6 vadd.w vr8, vr8, vr10 vadd.w vr2, vr2, vr8 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr2, vr2, vr2 vpickve2gr.du t6, vr2, 0 // sqr slli.d t4, t6, 32 add.d a0, t4, t5 endfunc_x264 /* * uint64_t pixel_var_8x16( pixel *pix, intptr_t i_stride ) */ function_x264 pixel_var_8x16_lsx slli.d t0, a1, 1 add.d t1, a1, t0 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 alsl.d a0, a1, a0, 2 FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vhaddw.hu.bu vr2, vr0, vr0 vhaddw.hu.bu vr3, vr1, vr1 vhaddw.hu.bu vr6, vr4, vr4 vhaddw.hu.bu vr7, vr5, vr5 vadd.h vr2, vr2, vr3 vadd.h vr6, vr6, vr7 vadd.h vr16, vr2, vr6 vmulwev.h.bu vr2, vr0, vr0 vmulwod.h.bu vr3, vr0, vr0 vmulwev.h.bu vr6, vr1, vr1 vmulwod.h.bu vr7, vr1, vr1 vmulwev.h.bu vr8, vr4, vr4 vmulwod.h.bu vr9, vr4, vr4 vmulwev.h.bu vr10, vr5, vr5 vmulwod.h.bu vr11, vr5, vr5 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vadd.w vr12, vr2, vr3 vadd.w vr13, vr6, vr7 vadd.w vr14, vr8, vr9 vadd.w vr15, vr10, vr11 vadd.w vr12, vr12, vr13 vadd.w vr14, vr14, vr15 vadd.w vr12, vr12, vr14 alsl.d a0, a1, a0, 2 FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 alsl.d a0, a1, a0, 2 FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr4, vr5, vr4 vilvl.d vr5, vr7, vr6 vhaddw.hu.bu vr2, vr0, vr0 vhaddw.hu.bu vr3, vr1, vr1 vhaddw.hu.bu vr6, vr4, vr4 vhaddw.hu.bu vr7, vr5, vr5 vadd.h vr2, vr2, vr3 vadd.h vr6, vr6, vr7 vadd.h vr2, vr2, vr6 vadd.h vr2, vr2, vr16 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr2, vr2, vr2 vpickve2gr.wu t5, vr2, 0 // sum vmulwev.h.bu vr2, vr0, vr0 vmulwod.h.bu 
vr3, vr0, vr0 vmulwev.h.bu vr6, vr1, vr1 vmulwod.h.bu vr7, vr1, vr1 vmulwev.h.bu vr8, vr4, vr4 vmulwod.h.bu vr9, vr4, vr4 vmulwev.h.bu vr10, vr5, vr5 vmulwod.h.bu vr11, vr5, vr5 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.wu.hu vr3, vr3, vr3 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vadd.w vr2, vr2, vr3 vadd.w vr6, vr6, vr7 vadd.w vr8, vr8, vr9 vadd.w vr10, vr10, vr11 vadd.w vr2, vr2, vr6 vadd.w vr8, vr8, vr10 vadd.w vr2, vr2, vr8 vadd.w vr2, vr2, vr12 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr2, vr2, vr2 vpickve2gr.du t6, vr2, 0 // sqr slli.d t4, t6, 32 add.d a0, t4, t5 endfunc_x264 /* * uint64_t pixel_var_16x16( pixel *pix, intptr_t i_stride ) */ function_x264 pixel_var_16x16_lsx slli.d t0, a1, 1 add.d t1, t0, a1 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 vhaddw.hu.bu vr4, vr0, vr0 vhaddw.hu.bu vr5, vr1, vr1 vhaddw.hu.bu vr6, vr2, vr2 vhaddw.hu.bu vr7, vr3, vr3 vadd.h vr4, vr5, vr4 vadd.h vr5, vr7, vr6 vadd.h vr13, vr5, vr4 vmulwev.h.bu vr5, vr0, vr0 vmulwod.h.bu vr6, vr0, vr0 vmulwev.h.bu vr7, vr1, vr1 vmulwod.h.bu vr8, vr1, vr1 vmulwev.h.bu vr9, vr2, vr2 vmulwod.h.bu vr10, vr2, vr2 vmulwev.h.bu vr11, vr3, vr3 vmulwod.h.bu vr12, vr3, vr3 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.wu.hu vr12, vr12, vr12 vadd.w vr5, vr5, vr6 vadd.w vr6, vr8, vr7 vadd.w vr7, vr10, vr9 vadd.w vr8, vr12, vr11 vadd.w vr0, vr5, vr6 vadd.w vr1, vr8, vr7 vadd.w vr14, vr1, vr0 .rept 3 alsl.d a0, a1, a0, 2 LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 vhaddw.hu.bu vr4, vr0, vr0 vhaddw.hu.bu vr5, vr1, vr1 vhaddw.hu.bu vr6, vr2, vr2 vhaddw.hu.bu vr7, vr3, vr3 vadd.h vr4, vr5, vr4 vadd.h vr5, vr7, vr6 vadd.h vr4, vr5, vr4 vadd.h vr13, vr4, vr13 vmulwev.h.bu vr5, vr0, vr0 vmulwod.h.bu vr6, vr0, vr0 vmulwev.h.bu vr7, vr1, vr1 vmulwod.h.bu vr8, vr1, vr1 vmulwev.h.bu vr9, vr2, vr2 vmulwod.h.bu vr10, vr2, vr2 vmulwev.h.bu vr11, vr3, vr3 vmulwod.h.bu vr12, vr3, vr3 vhaddw.wu.hu vr5, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.wu.hu vr12, vr12, vr12 vadd.w vr5, vr5, vr6 vadd.w vr6, vr8, vr7 vadd.w vr7, vr10, vr9 vadd.w vr8, vr12, vr11 vadd.w vr0, vr5, vr6 vadd.w vr1, vr8, vr7 vadd.w vr0, vr1, vr0 vadd.w vr14, vr0, vr14 .endr vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu t4, vr13, 0 vhaddw.du.wu vr14, vr14, vr14 vhaddw.qu.du vr14, vr14, vr14 vpickve2gr.du t6, vr14, 0 // sqr slli.d t5, t6, 32 add.d a0, t4, t5 endfunc_x264 .macro sse_diff_8width_lsx in0, in1, in2, in3 fld.d f0, \in0, 0 fld.d f1, \in0, FENC_STRIDE fld.d f2, \in0, FENC_STRIDE * 2 fld.d f3, \in0, FENC_STRIDE * 3 fld.d f4, \in1, 0 fld.d f5, \in1, FDEC_STRIDE fld.d f6, \in1, FDEC_STRIDE * 2 fld.d f7, \in1, FDEC_STRIDE * 3 vilvl.d vr0, vr1, vr0 vilvl.d vr1, vr3, vr2 vilvl.d vr2, vr5, vr4 vilvl.d vr3, vr7, vr6 vsubwev.h.bu vr4, vr0, vr2 vsubwod.h.bu vr5, vr0, vr2 vsubwev.h.bu vr6, vr1, vr3 vsubwod.h.bu vr7, vr1, vr3 // sqr_u vdp2add.w.h \in2, vr4, vr4 vdp2add.w.h \in2, vr5, vr5 vdp2add.w.h \in2, vr6, vr6 vdp2add.w.h \in2, vr7, vr7 // sum_u vadd.h vr4, vr4, vr5 vadd.h vr6, vr6, vr7 vadd.h \in3, vr4, vr6 .endm /* * int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] ) */ function_x264 
pixel_var2_8x8_lsx vxor.v vr8, vr8, vr8 sse_diff_8width_lsx a0, a1, vr8, vr9 addi.d t0, a0, FENC_STRIDE * 4 addi.d t1, a1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr10 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t2, vr8, 0 // sqr_u vadd.h vr8, vr10, vr9 vhaddw.w.h vr8, vr8, vr8 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t3, vr8, 0 // sum_u addi.d a0, a0, FENC_STRIDE / 2 addi.d a1, a1, FDEC_STRIDE / 2 vxor.v vr8, vr8, vr8 sse_diff_8width_lsx a0, a1, vr8, vr9 addi.d t0, a0, FENC_STRIDE * 4 addi.d t1, a1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr10 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t4, vr8, 0 // sqr_v vadd.h vr8, vr10, vr9 vhaddw.w.h vr8, vr8, vr8 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t5, vr8, 0 // sum_v st.w t2, a2, 0 st.w t4, a2, 4 mul.w t3, t3, t3 mul.w t5, t5, t5 srai.w t3, t3, 6 srai.w t5, t5, 6 sub.w t2, t2, t3 sub.w t4, t4, t5 add.w a0, t2, t4 endfunc_x264 /* * int pixel_var2_8x16( pixel *fenc, pixel *fdec, int ssd[2] ) */ function_x264 pixel_var2_8x16_lsx vxor.v vr8, vr8, vr8 sse_diff_8width_lsx a0, a1, vr8, vr9 addi.d t0, a0, FENC_STRIDE * 4 addi.d t1, a1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr10 addi.d t0, t0, FENC_STRIDE * 4 addi.d t1, t1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr11 addi.d t0, t0, FENC_STRIDE * 4 addi.d t1, t1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr12 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t2, vr8, 0 // sqr_u vadd.h vr8, vr10, vr9 vadd.h vr8, vr11, vr8 vadd.h vr8, vr12, vr8 vhaddw.w.h vr8, vr8, vr8 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t3, vr8, 0 // sum_u addi.d a0, a0, FENC_STRIDE / 2 addi.d a1, a1, FDEC_STRIDE / 2 vxor.v vr8, vr8, vr8 sse_diff_8width_lsx a0, a1, vr8, vr9 addi.d t0, a0, FENC_STRIDE * 4 addi.d t1, a1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr10 addi.d t0, t0, FENC_STRIDE * 4 addi.d t1, t1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr11 addi.d t0, t0, FENC_STRIDE * 4 addi.d t1, t1, FDEC_STRIDE * 4 sse_diff_8width_lsx t0, t1, vr8, vr12 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t4, vr8, 0 // sqr_v vadd.h vr8, vr10, vr9 vadd.h vr8, vr11, vr8 vadd.h vr8, vr12, vr8 vhaddw.w.h vr8, vr8, vr8 vhaddw.d.w vr8, vr8, vr8 vhaddw.q.d vr8, vr8, vr8 vpickve2gr.w t5, vr8, 0 // sum_v st.w t2, a2, 0 st.w t4, a2, 4 mul.w t3, t3, t3 mul.w t5, t5, t5 srai.w t3, t3, 7 srai.w t5, t5, 7 sub.w t2, t2, t3 sub.w t4, t4, t5 add.w a0, t2, t4 endfunc_x264 #endif /* !HIGH_BIT_DEPTH */ x264-master/common/loongarch/pixel-c.c000066400000000000000000000231711502133446700200340ustar00rootroot00000000000000/***************************************************************************** * pixel-c.c: loongarch pixel metrics ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Hecai Yuan * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "pixel.h" #include "predict.h" #if !HIGH_BIT_DEPTH uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8 * i_stride, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8, i_stride ); u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8 * i_stride, i_stride ); u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8 * i_stride + 8, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = x264_hadamard_ac_8x8_lasx( p_pix, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = x264_hadamard_ac_8x8_lasx( p_pix, i_stride ); u_sum += x264_hadamard_ac_8x8_lasx( p_pix + ( i_stride << 3 ), i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ) { ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); x264_predict_8x8_v_lsx( pix, p_edge ); p_sad_array[0] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8_h_lsx( pix, p_edge ); p_sad_array[1] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8_dc_lsx( pix, p_edge ); p_sad_array[2] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ) { ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); x264_predict_8x8_v_lsx( pix, p_edge ); p_sad_array[0] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8_h_lasx( pix, p_edge ); p_sad_array[1] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8_dc_lsx( pix, p_edge ); p_sad_array[2] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_4x4_v_lsx( p_dec ); p_sad_array[0] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_4x4_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, 
FENC_STRIDE ); x264_predict_4x4_dc_lsx( p_dec ); p_sad_array[2] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_16x16_v_lsx( p_dec ); p_sad_array[0] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_16x16_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_16x16_dc_lsx( p_dec ); p_sad_array[2] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_16x16_v_lsx( p_dec ); p_sad_array[0] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_16x16_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_16x16_dc_lsx( p_dec ); p_sad_array[2] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_8x8c_dc_lsx( p_dec ); p_sad_array[0] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8c_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8c_v_lsx( p_dec ); p_sad_array[2] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_4x4_v_lsx( p_dec ); p_sad_array[0] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_4x4_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_4x4_dc_lsx( p_dec ); p_sad_array[2] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_16x16_v_lsx( p_dec ); p_sad_array[0] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_16x16_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_16x16_dc_lsx( p_dec ); p_sad_array[2] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ) { ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); x264_predict_8x8_v_lsx( pix, p_edge ); p_sad_array[0] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8_h_lsx( pix, p_edge ); p_sad_array[1] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8_dc_lsx( pix, p_edge ); p_sad_array[2] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_predict_8x8c_dc_lsx( p_dec ); p_sad_array[0] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8c_h_lsx( p_dec ); p_sad_array[1] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_predict_8x8c_v_lsx( p_dec ); p_sad_array[2] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } #endif 
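As a reading aid for the LSX variance kernels above (pixel_var_8x8_lsx and pixel_var2_8x8_lsx): the scalar sketch below restates what those routines compute, assuming 8-bit pixels and x264's FENC_STRIDE/FDEC_STRIDE block strides. It is not taken from the tree; the *_sketch helper names are illustrative only, and the result packing and plane offsets are read directly off the assembly (sum of squares in the high 32 bits of the var result; the V samples half a stride to the right of the U samples in var2).

#include <stdint.h>

#ifndef FENC_STRIDE
#define FENC_STRIDE 16  /* x264's encode-side macroblock stride */
#define FDEC_STRIDE 32  /* x264's decode-side macroblock stride */
#endif

/* pixel_var_8x8: returns (sum_of_squares << 32) + sum, matching the
 * final slli.d/add.d pair of the LSX version. */
static uint64_t var_8x8_sketch( const uint8_t *pix, intptr_t stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++, pix += stride )
        for( int x = 0; x < 8; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return ( (uint64_t)sqr << 32 ) + sum;
}

/* pixel_var2_8x8: U block at column 0, V block half a stride to the
 * right; the per-plane SSDs go to ssd[0]/ssd[1], and the return value
 * is the summed ssd - sum*sum/64 of both planes (the assembly shifts
 * by 6 here; the 8x16 variant shifts by 7). */
static int var2_8x8_sketch( const uint8_t *fenc, const uint8_t *fdec, int ssd[2] )
{
    int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0;
    for( int y = 0; y < 8; y++, fenc += FENC_STRIDE, fdec += FDEC_STRIDE )
        for( int x = 0; x < 8; x++ )
        {
            int du = fenc[x] - fdec[x];
            int dv = fenc[x + FENC_STRIDE/2] - fdec[x + FDEC_STRIDE/2];
            sum_u += du;  sqr_u += du * du;
            sum_v += dv;  sqr_v += dv * dv;
        }
    ssd[0] = sqr_u;
    ssd[1] = sqr_v;
    return ( sqr_u - ( sum_u * sum_u >> 6 ) ) + ( sqr_v - ( sum_v * sum_v >> 6 ) );
}

Packing sum and sum-of-squares into a single uint64_t lets a caller derive the variance ( sqr - sum*sum/64 ) from one return value without extra output parameters, which is how the packed result of pixel_var is meant to be consumed.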
x264-master/common/loongarch/pixel.h000066400000000000000000000511501502133446700176170ustar00rootroot00000000000000/***************************************************************************** * pixel.h: loongarch pixel metrics ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Lu Wang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_LOONGARCH_PIXEL_H #define X264_LOONGARCH_PIXEL_H #define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx) int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx) int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx) int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx) int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx) int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx) int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx) int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx) int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx) int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx) int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx) int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx) int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx) int32_t 
x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx) int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx) int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx) void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx) void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx) void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx) void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx) void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx) void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx) void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx) void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx) void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx) void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx) void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx) void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx) void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define 
x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx) void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx) void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx) void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx) void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx) void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx) void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx) void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx) int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx) int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx) int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx) int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx) int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx) int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx) int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx) int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx) int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx) uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx) uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx) uint64_t 
x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx) uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx) uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx) uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx) uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx) uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx) uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx) uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride ); #define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx) void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx) void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx) void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx) void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx) int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx) int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx) int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx) int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx) int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx) int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx) int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx) int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx) int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx) int32_t 
x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx) int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx) int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx) int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2, int32_t ssd[2] ); #define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx) int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2, int32_t ssd[2] ); #define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx) uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx) uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx) uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx) int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2, int32_t ssd[2] ); #define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx) int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2, int32_t ssd[2] ); #define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx) int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx) int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx) void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ); #define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx) void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ); #define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx) int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx) int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx) void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx) void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ); #define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx) void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx) void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #endif x264-master/common/loongarch/predict-a.S000066400000000000000000001445651502133446700203360ustar00rootroot00000000000000/***************************************************************************** * predict-a.S: loongarch predict functions ***************************************************************************** * Copyright (C) 2023-2025 
x264 project * * Authors: Xiwei Gu * Lu Wang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" #if !HIGH_BIT_DEPTH /**************************************************************************** * 4x4 prediction for intra luma block ****************************************************************************/ /* void x264_predict_4x4_v_c( pixel *src ) */ function_x264 predict_4x4_v_lsx ld.wu t0, a0, -FDEC_STRIDE st.w t0, a0, 0 st.w t0, a0, FDEC_STRIDE st.w t0, a0, FDEC_STRIDE * 2 st.w t0, a0, FDEC_STRIDE * 3 endfunc_x264 /* void x264_predict_4x4_h_c( pixel *src ) */ function_x264 predict_4x4_h_lsx vldrepl.b vr0, a0, -1 vldrepl.b vr1, a0, FDEC_STRIDE - 1 vldrepl.b vr2, a0, FDEC_STRIDE * 2 - 1 vldrepl.b vr3, a0, FDEC_STRIDE * 3 - 1 fst.s f0, a0, 0 fst.s f1, a0, FDEC_STRIDE fst.s f2, a0, FDEC_STRIDE * 2 fst.s f3, a0, FDEC_STRIDE * 3 endfunc_x264 /* void x264_predict_4x4_dc_c( pixel *src ) */ function_x264 predict_4x4_dc_lsx fld.s f0, a0, -FDEC_STRIDE ld.bu t0, a0, -1 ld.bu t1, a0, FDEC_STRIDE - 1 ld.bu t2, a0, FDEC_STRIDE * 2 - 1 ld.bu t3, a0, FDEC_STRIDE * 3 - 1 vhaddw.hu.bu vr1, vr0, vr0 vhaddw.wu.hu vr2, vr1, vr1 vpickve2gr.w t4, vr2, 0 add.w t0, t0, t1 add.w t0, t0, t2 add.w t0, t0, t3 add.w t0, t0, t4 addi.w t0, t0, 4 srai.w t0, t0, 3 vreplgr2vr.b vr0, t0 vstelm.w vr0, a0, 0, 0 vstelm.w vr0, a0, FDEC_STRIDE, 0 vstelm.w vr0, a0, FDEC_STRIDE * 2, 0 vstelm.w vr0, a0, FDEC_STRIDE * 3, 0 endfunc_x264 /* void predict_4x4_dc_top_c( pixel *src ) */ function_x264 predict_4x4_dc_top_lsx fld.s f0, a0, -FDEC_STRIDE vhaddw.hu.bu vr1, vr0, vr0 vhaddw.wu.hu vr2, vr1, vr1 vsrari.w vr2, vr2, 2 vreplvei.b vr3, vr2, 0 fst.s f3, a0, 0 fst.s f3, a0, FDEC_STRIDE fst.s f3, a0, FDEC_STRIDE * 2 fst.s f3, a0, FDEC_STRIDE * 3 endfunc_x264 /* void predict_4x4_dc_left_c( pixel *src ) */ function_x264 predict_4x4_dc_left_lsx ld.bu t0, a0, -1 ld.bu t1, a0, FDEC_STRIDE - 1 ld.bu t2, a0, FDEC_STRIDE * 2 - 1 ld.bu t3, a0, FDEC_STRIDE * 3 - 1 add.w t0, t0, t1 add.w t0, t0, t2 add.w t0, t0, t3 addi.w t0, t0, 2 srai.w t0, t0, 2 vreplgr2vr.b vr3, t0 fst.s f3, a0, 0 fst.s f3, a0, FDEC_STRIDE fst.s f3, a0, FDEC_STRIDE * 2 fst.s f3, a0, FDEC_STRIDE * 3 endfunc_x264 /* void predict_4x4_dc_128_c( pixel *src ) */ function_x264 predict_4x4_dc_128_lsx addi.w t0, zero, 1 slli.w t0, t0, BIT_DEPTH - 1 vreplgr2vr.b vr3, t0 fst.s f3, a0, 0 fst.s f3, a0, FDEC_STRIDE fst.s f3, a0, FDEC_STRIDE * 2 fst.s f3, a0, FDEC_STRIDE * 3 endfunc_x264 /* void predict_4x4_ddl_c( pixel *src ) */ function_x264 predict_4x4_ddl_lsx fld.d f0, a0, -FDEC_STRIDE vxor.v vr10, vr10, vr10 vilvl.b vr0, vr10, vr0 vbsrl.v vr1, vr0, 2 vbsrl.v vr2, vr0, 4 // t7 vextrins.h 
vr2, vr0, 0x67 vslli.h vr1, vr1, 1 vadd.h vr0, vr0, vr1 vadd.h vr2, vr0, vr2 vssrarni.bu.h vr3, vr2, 2 fst.s f3, a0, 0 vbsrl.v vr4, vr3, 1 fst.s f4, a0, FDEC_STRIDE vbsrl.v vr4, vr4, 1 fst.s f4, a0, FDEC_STRIDE * 2 vbsrl.v vr4, vr4, 1 fst.s f4, a0, FDEC_STRIDE * 3 endfunc_x264 /**************************************************************************** * 8x8 prediction for intra chroma block (4:2:0) ****************************************************************************/ /* void x264_predict_8x8c_p_lsx( pixel *src ) */ const mula .short 1, 2, 3, 4, 0, 0, 0, 0 endconst const mulb .short 0, 1, 2, 3, 4, 5, 6, 7 endconst function_x264 predict_8x8c_p_lsx la.local t0, mula fld.d f3, t0, 0 fld.s f4, a0, 4 - FDEC_STRIDE fld.s f5, a0, -1 - FDEC_STRIDE vxor.v vr0, vr0, vr0 vilvl.b vr4, vr0, vr4 vilvl.b vr5, vr0, vr5 vshuf4i.h vr5, vr5, 0x1b vsub.h vr4, vr4, vr5 vmul.h vr4, vr4, vr3 vhaddw.w.h vr4, vr4, vr4 vhaddw.d.w vr4, vr4, vr4 vpickve2gr.w t0, vr4, 0 /* H */ fld.s f6, a0, FDEC_STRIDE * 4 - 1 fld.s f7, a0, FDEC_STRIDE * 5 - 1 fld.s f8, a0, FDEC_STRIDE * 6 - 1 fld.s f9, a0, FDEC_STRIDE * 7 - 1 fld.s f10, a0, FDEC_STRIDE * 2 - 1 fld.s f11, a0, FDEC_STRIDE - 1 fld.s f12, a0, -1 fld.s f13, a0, -1 - FDEC_STRIDE vilvl.b vr6, vr7, vr6 vilvl.b vr9, vr9, vr8 vilvl.h vr6, vr9, vr6 vilvl.b vr10, vr11, vr10 vilvl.b vr12, vr13, vr12 vilvl.h vr10, vr12, vr10 vilvl.b vr6, vr0, vr6 vilvl.b vr10, vr0, vr10 vsub.h vr6, vr6, vr10 vmul.h vr6, vr6, vr3 vhaddw.w.h vr6, vr6, vr6 vhaddw.d.w vr6, vr6, vr6 vpickve2gr.w t1, vr6, 0 /* V */ ld.bu t2, a0, FDEC_STRIDE * 7 - 1 ld.bu t3, a0, 7 - FDEC_STRIDE add.w t2, t2, t3 slli.w t2, t2, 4 /* a */ slli.w t3, t0, 4 add.w t0, t0, t3 addi.w t0, t0, 16 srai.w t0, t0, 5 /* b */ slli.w t3, t1, 4 add.w t1, t1, t3 addi.w t1, t1, 16 srai.w t1, t1, 5 /* c */ add.w t3, t0, t1 slli.w t4, t3, 1 add.w t4, t4, t3 sub.w t5, t2, t4 addi.w t5, t5, 16 /* i00 */ la.local t3, mulb vld vr14, t3, 0 vreplgr2vr.h vr12, t0 vmul.h vr12, vr12, vr14 vreplgr2vr.h vr14, t5 add.w t5, t5, t1 vreplgr2vr.h vr15, t5 add.w t5, t5, t1 vreplgr2vr.h vr16, t5 add.w t5, t5, t1 vreplgr2vr.h vr17, t5 add.w t5, t5, t1 vreplgr2vr.h vr18, t5 add.w t5, t5, t1 vreplgr2vr.h vr19, t5 add.w t5, t5, t1 vreplgr2vr.h vr20, t5 add.w t5, t5, t1 vreplgr2vr.h vr21, t5 vadd.h vr14, vr12, vr14 vadd.h vr15, vr12, vr15 vadd.h vr16, vr12, vr16 vadd.h vr17, vr12, vr17 vadd.h vr18, vr12, vr18 vadd.h vr19, vr12, vr19 vadd.h vr20, vr12, vr20 vadd.h vr21, vr12, vr21 vssrani.bu.h vr14, vr14, 5 vssrani.bu.h vr15, vr15, 5 vssrani.bu.h vr16, vr16, 5 vssrani.bu.h vr17, vr17, 5 vssrani.bu.h vr18, vr18, 5 vssrani.bu.h vr19, vr19, 5 vssrani.bu.h vr20, vr20, 5 vssrani.bu.h vr21, vr21, 5 fst.d f14, a0, 0 fst.d f15, a0, FDEC_STRIDE fst.d f16, a0, FDEC_STRIDE * 2 fst.d f17, a0, FDEC_STRIDE * 3 fst.d f18, a0, FDEC_STRIDE * 4 fst.d f19, a0, FDEC_STRIDE * 5 fst.d f20, a0, FDEC_STRIDE * 6 fst.d f21, a0, FDEC_STRIDE * 7 endfunc_x264 /* void x264_predict_8x8c_v_lsx( pixel *src ) */ function_x264 predict_8x8c_v_lsx fld.d f0, a0, -FDEC_STRIDE fst.d f0, a0, 0 fst.d f0, a0, FDEC_STRIDE fst.d f0, a0, FDEC_STRIDE * 2 fst.d f0, a0, FDEC_STRIDE * 3 fst.d f0, a0, FDEC_STRIDE * 4 fst.d f0, a0, FDEC_STRIDE * 5 fst.d f0, a0, FDEC_STRIDE * 6 fst.d f0, a0, FDEC_STRIDE * 7 endfunc_x264 /* void x264_predict_8x8c_h_lsx( pixel *src ) */ function_x264 predict_8x8c_h_lsx vldrepl.b vr0, a0, -1 vldrepl.b vr1, a0, FDEC_STRIDE - 1 vldrepl.b vr2, a0, FDEC_STRIDE * 2 - 1 vldrepl.b vr3, a0, FDEC_STRIDE * 3 - 1 vldrepl.b vr4, a0, FDEC_STRIDE * 4 - 1 vldrepl.b vr5, a0, FDEC_STRIDE * 5 
- 1 vldrepl.b vr6, a0, FDEC_STRIDE * 6 - 1 vldrepl.b vr7, a0, FDEC_STRIDE * 7 - 1 fst.d f0, a0, 0 fst.d f1, a0, FDEC_STRIDE fst.d f2, a0, FDEC_STRIDE * 2 fst.d f3, a0, FDEC_STRIDE * 3 fst.d f4, a0, FDEC_STRIDE * 4 fst.d f5, a0, FDEC_STRIDE * 5 fst.d f6, a0, FDEC_STRIDE * 6 fst.d f7, a0, FDEC_STRIDE * 7 endfunc_x264 /* void x264_predict_8x8c_dc_lsx( pixel *src ) */ function_x264 predict_8x8c_dc_lsx fld.s f0, a0, -FDEC_STRIDE fld.s f1, a0, 4 - FDEC_STRIDE vhaddw.hu.bu vr2, vr0, vr0 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.hu.bu vr3, vr1, vr1 vhaddw.wu.hu vr3, vr3, vr3 vpickve2gr.w t0, vr2, 0 /* s0 */ vpickve2gr.w t1, vr3, 0 /* s1 */ ld.bu t2, a0, -1 ld.bu t3, a0, FDEC_STRIDE - 1 ld.bu t4, a0, FDEC_STRIDE * 2 - 1 ld.bu t5, a0, FDEC_STRIDE * 3 - 1 add.w t2, t2, t3 add.w t2, t2, t4 add.w t2, t2, t5 /* s2 */ ld.bu t3, a0, FDEC_STRIDE * 4 - 1 ld.bu t4, a0, FDEC_STRIDE * 5 - 1 ld.bu t5, a0, FDEC_STRIDE * 6 - 1 ld.bu t6, a0, FDEC_STRIDE * 7 - 1 add.w t3, t3, t4 add.w t3, t3, t5 add.w t3, t3, t6 /* s3 */ add.w t4, t0, t2 addi.w t4, t4, 4 srai.w t4, t4, 3 /* ( s0 + s2 + 4 ) >> 3 */ addi.w t5, t1, 2 srai.w t5, t5, 2 /* ( s1 + 2 ) >> 2 */ addi.w t6, t3, 2 srai.w t6, t6, 2 /* ( s3 + 2 ) >> 2 */ add.w t7, t1, t3 addi.w t7, t7, 4 srai.w t7, t7, 3 /* ( s1 + s3 + 4 ) >> 3 */ vreplgr2vr.b vr4, t4 vreplgr2vr.b vr5, t5 vreplgr2vr.b vr6, t6 vreplgr2vr.b vr7, t7 vpackev.w vr4, vr5, vr4 vpackev.w vr6, vr7, vr6 fst.d f4, a0, 0 fst.d f4, a0, FDEC_STRIDE fst.d f4, a0, FDEC_STRIDE * 2 fst.d f4, a0, FDEC_STRIDE * 3 fst.d f6, a0, FDEC_STRIDE * 4 fst.d f6, a0, FDEC_STRIDE * 5 fst.d f6, a0, FDEC_STRIDE * 6 fst.d f6, a0, FDEC_STRIDE * 7 endfunc_x264 /* void x264_predict_8x8c_dc_128_lsx( pixel *src ) */ function_x264 predict_8x8c_dc_128_lsx ori t1, t0, 1 slli.d t1, t1, BIT_DEPTH - 1 vreplgr2vr.b vr4, t1 fst.d f4, a0, 0 fst.d f4, a0, FDEC_STRIDE fst.d f4, a0, FDEC_STRIDE * 2 fst.d f4, a0, FDEC_STRIDE * 3 fst.d f4, a0, FDEC_STRIDE * 4 fst.d f4, a0, FDEC_STRIDE * 5 fst.d f4, a0, FDEC_STRIDE * 6 fst.d f4, a0, FDEC_STRIDE * 7 endfunc_x264 /* void x264_predict_8x8c_dc_top_lsx( pixel *src ) */ function_x264 predict_8x8c_dc_top_lsx fld.s f0, a0, -FDEC_STRIDE fld.s f1, a0, 4 - FDEC_STRIDE vhaddw.hu.bu vr0, vr0, vr0 vhaddw.wu.hu vr0, vr0, vr0 vhaddw.hu.bu vr1, vr1, vr1 vhaddw.wu.hu vr1, vr1, vr1 vpickve2gr.w t0, vr0, 0 /* dc0 */ vpickve2gr.w t1, vr1, 0 /* dc1 */ addi.w t0, t0, 2 srai.w t0, t0, 2 addi.w t1, t1, 2 srai.w t1, t1, 2 vreplgr2vr.b vr4, t0 vreplgr2vr.b vr5, t1 vpackev.w vr4, vr5, vr4 fst.d f4, a0, 0 fst.d f4, a0, FDEC_STRIDE fst.d f4, a0, FDEC_STRIDE * 2 fst.d f4, a0, FDEC_STRIDE * 3 fst.d f4, a0, FDEC_STRIDE * 4 fst.d f4, a0, FDEC_STRIDE * 5 fst.d f4, a0, FDEC_STRIDE * 6 fst.d f4, a0, FDEC_STRIDE * 7 endfunc_x264 /* void x264_predict_8x8c_dc_left_lsx( pixel *src ) */ function_x264 predict_8x8c_dc_left_lsx ld.bu t0, a0, -1 ld.bu t1, a0, FDEC_STRIDE - 1 ld.bu t2, a0, FDEC_STRIDE * 2 - 1 ld.bu t3, a0, FDEC_STRIDE * 3 - 1 add.w t0, t0, t1 add.w t0, t0, t2 add.w t0, t0, t3 ld.bu t1, a0, FDEC_STRIDE * 4 - 1 ld.bu t2, a0, FDEC_STRIDE * 5 - 1 ld.bu t3, a0, FDEC_STRIDE * 6 - 1 ld.bu t4, a0, FDEC_STRIDE * 7 - 1 add.w t1, t1, t2 add.w t1, t1, t3 add.w t1, t1, t4 addi.w t0, t0, 2 srai.w t0, t0, 2 addi.w t1, t1, 2 srai.w t1, t1, 2 vreplgr2vr.b vr4, t0 /* ( dc0 + 2 ) >> 2 */ vreplgr2vr.b vr5, t1 /* ( dc1 + 2 ) >> 2 */ fst.d f4, a0, 0 fst.d f4, a0, FDEC_STRIDE fst.d f4, a0, FDEC_STRIDE * 2 fst.d f4, a0, FDEC_STRIDE * 3 fst.d f5, a0, FDEC_STRIDE * 4 fst.d f5, a0, FDEC_STRIDE * 5 fst.d f5, a0, FDEC_STRIDE * 6 fst.d f5, a0, FDEC_STRIDE * 7 
endfunc_x264 /**************************************************************************** * 8x8 prediction for intra luma block ****************************************************************************/ /* void predict_8x8_v_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_v_lsx fld.d f0, a1, 16 fst.d f0, a0, 0 fst.d f0, a0, FDEC_STRIDE fst.d f0, a0, FDEC_STRIDE * 2 fst.d f0, a0, FDEC_STRIDE * 3 fst.d f0, a0, FDEC_STRIDE * 4 fst.d f0, a0, FDEC_STRIDE * 5 fst.d f0, a0, FDEC_STRIDE * 6 fst.d f0, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_h_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_h_lasx fld.d f0, a1, 7 xvinsve0.w xr0, xr0, 5 xvrepl128vei.b xr4, xr0, 7 xvrepl128vei.b xr3, xr0, 6 xvrepl128vei.b xr2, xr0, 5 xvrepl128vei.b xr1, xr0, 4 fst.d f4, a0, 0 fst.d f3, a0, FDEC_STRIDE fst.d f2, a0, FDEC_STRIDE * 2 fst.d f1, a0, FDEC_STRIDE * 3 xvstelm.d xr4, a0, FDEC_STRIDE * 4, 2 xvstelm.d xr3, a0, FDEC_STRIDE * 5, 2 xvstelm.d xr2, a0, FDEC_STRIDE * 6, 2 xvstelm.d xr1, a0, FDEC_STRIDE * 7, 2 endfunc_x264 function_x264 predict_8x8_h_lsx fld.d f0, a1, 7 vreplvei.w vr1, vr0, 0 vreplvei.b vr4, vr0, 7 vreplvei.b vr5, vr1, 7 vreplvei.b vr6, vr0, 6 vreplvei.b vr7, vr1, 6 vreplvei.b vr8, vr0, 5 vreplvei.b vr9, vr1, 5 vreplvei.b vr10, vr0, 4 vreplvei.b vr11, vr1, 4 fst.d f4, a0, 0 fst.d f6, a0, FDEC_STRIDE fst.d f8, a0, FDEC_STRIDE * 2 fst.d f10, a0, FDEC_STRIDE * 3 vstelm.d vr5, a0, FDEC_STRIDE * 4, 0 vstelm.d vr7, a0, FDEC_STRIDE * 5, 0 vstelm.d vr9, a0, FDEC_STRIDE * 6, 0 vstelm.d vr11, a0, FDEC_STRIDE * 7, 0 endfunc_x264 /* void predict_8x8_dc_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_dc_lsx fld.d f0, a1, 7 fld.d f1, a1, 16 vilvl.d vr0, vr1, vr0 vhaddw.hu.bu vr1, vr0, vr0 vhaddw.wu.hu vr2, vr1, vr1 vhaddw.du.wu vr3, vr2, vr2 vhaddw.qu.du vr4, vr3, vr3 vsrari.w vr4, vr4, 4 vreplvei.b vr5, vr4, 0 fst.d f5, a0, 0 fst.d f5, a0, FDEC_STRIDE fst.d f5, a0, FDEC_STRIDE * 2 fst.d f5, a0, FDEC_STRIDE * 3 fst.d f5, a0, FDEC_STRIDE * 4 fst.d f5, a0, FDEC_STRIDE * 5 fst.d f5, a0, FDEC_STRIDE * 6 fst.d f5, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_dc_left_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_dc_left_lsx fld.d f0, a1, 7 vhaddw.hu.bu vr1, vr0, vr0 vhaddw.wu.hu vr2, vr1, vr1 vhaddw.du.wu vr3, vr2, vr2 vsrari.w vr3, vr3, 3 vreplvei.b vr5, vr3, 0 fst.d f5, a0, 0 fst.d f5, a0, FDEC_STRIDE fst.d f5, a0, FDEC_STRIDE * 2 fst.d f5, a0, FDEC_STRIDE * 3 fst.d f5, a0, FDEC_STRIDE * 4 fst.d f5, a0, FDEC_STRIDE * 5 fst.d f5, a0, FDEC_STRIDE * 6 fst.d f5, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_dc_top_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_dc_top_lsx fld.d f0, a1, 16 vhaddw.hu.bu vr1, vr0, vr0 vhaddw.wu.hu vr2, vr1, vr1 vhaddw.du.wu vr3, vr2, vr2 vsrari.w vr3, vr3, 3 vreplvei.b vr5, vr3, 0 fst.d f5, a0, 0 fst.d f5, a0, FDEC_STRIDE fst.d f5, a0, FDEC_STRIDE * 2 fst.d f5, a0, FDEC_STRIDE * 3 fst.d f5, a0, FDEC_STRIDE * 4 fst.d f5, a0, FDEC_STRIDE * 5 fst.d f5, a0, FDEC_STRIDE * 6 fst.d f5, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_dc_128_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_dc_128_lsx addi.w t0, zero, 1 slli.d t1, t0, (BIT_DEPTH-1) vreplgr2vr.b vr5, t1 fst.d f5, a0, 0 fst.d f5, a0, FDEC_STRIDE fst.d f5, a0, FDEC_STRIDE * 2 fst.d f5, a0, FDEC_STRIDE * 3 fst.d f5, a0, FDEC_STRIDE * 4 fst.d f5, a0, FDEC_STRIDE * 5 fst.d f5, a0, FDEC_STRIDE * 6 fst.d f5, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_ddl_c( pixel *src, pixel edge[36] ) */ function_x264 
predict_8x8_ddl_lasx vld vr1, a1, 16 vbsrl.v vr2, vr1, 1 vbsrl.v vr3, vr1, 2 vextrins.b vr3, vr1, 0xef vext2xv.hu.bu xr5, xr1 vext2xv.hu.bu xr6, xr2 vext2xv.hu.bu xr7, xr3 xvslli.h xr6, xr6, 1 xvadd.h xr8, xr5, xr6 xvadd.h xr9, xr8, xr7 xvssrarni.bu.h xr9, xr9, 2 xvpermi.d xr9, xr9, 0x08 vbsrl.v vr10, vr9, 1 vbsrl.v vr11, vr9, 2 vbsrl.v vr12, vr9, 3 vbsrl.v vr13, vr9, 4 vbsrl.v vr14, vr9, 5 vbsrl.v vr15, vr9, 6 vbsrl.v vr16, vr9, 7 fst.d f9, a0, 0 fst.d f10, a0, FDEC_STRIDE fst.d f11, a0, FDEC_STRIDE * 2 fst.d f12, a0, FDEC_STRIDE * 3 fst.d f13, a0, FDEC_STRIDE * 4 fst.d f14, a0, FDEC_STRIDE * 5 fst.d f15, a0, FDEC_STRIDE * 6 fst.d f16, a0, FDEC_STRIDE * 7 endfunc_x264 function_x264 predict_8x8_ddl_lsx vld vr1, a1, 16 vbsrl.v vr2, vr1, 1 vbsrl.v vr3, vr1, 2 vextrins.b vr3, vr1, 0xef vsllwil.hu.bu vr5, vr1, 0 vexth.hu.bu vr15, vr1 vsllwil.hu.bu vr6, vr2, 0 vexth.hu.bu vr16, vr2 vsllwil.hu.bu vr7, vr3, 0 vexth.hu.bu vr17, vr3 vslli.h vr6, vr6, 1 vslli.h vr16, vr16, 1 vadd.h vr8, vr5, vr6 vadd.h vr18, vr15, vr16 vadd.h vr19, vr8, vr7 vadd.h vr9, vr18, vr17 vssrarni.bu.h vr9, vr19, 2 vbsrl.v vr10, vr9, 1 vbsrl.v vr11, vr9, 2 vbsrl.v vr12, vr9, 3 vbsrl.v vr13, vr9, 4 vbsrl.v vr14, vr9, 5 vbsrl.v vr15, vr9, 6 vbsrl.v vr16, vr9, 7 fst.d f9, a0, 0 fst.d f10, a0, FDEC_STRIDE fst.d f11, a0, FDEC_STRIDE * 2 fst.d f12, a0, FDEC_STRIDE * 3 fst.d f13, a0, FDEC_STRIDE * 4 fst.d f14, a0, FDEC_STRIDE * 5 fst.d f15, a0, FDEC_STRIDE * 6 fst.d f16, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_ddr_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_ddr_lasx vld vr1, a1, 7 vbsrl.v vr2, vr1, 1 vbsrl.v vr3, vr1, 2 // edge[23] ld.bu t0, a1, 23 vinsgr2vr.b vr3, t0, 0xe vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 vext2xv.hu.bu xr3, xr3 xvslli.h xr2, xr2, 1 xvadd.h xr4, xr1, xr2 xvadd.h xr5, xr4, xr3 xvssrarni.bu.h xr5, xr5, 2 xvpermi.d xr6, xr5, 0x08 vbsrl.v vr7, vr6, 7 vbsrl.v vr8, vr6, 6 vbsrl.v vr9, vr6, 5 vbsrl.v vr10, vr6, 4 vbsrl.v vr11, vr6, 3 vbsrl.v vr12, vr6, 2 vbsrl.v vr13, vr6, 1 fst.d f7, a0, 0 fst.d f8, a0, FDEC_STRIDE fst.d f9, a0, FDEC_STRIDE * 2 fst.d f10, a0, FDEC_STRIDE * 3 fst.d f11, a0, FDEC_STRIDE * 4 fst.d f12, a0, FDEC_STRIDE * 5 fst.d f13, a0, FDEC_STRIDE * 6 fst.d f6, a0, FDEC_STRIDE * 7 endfunc_x264 function_x264 predict_8x8_ddr_lsx vld vr1, a1, 7 vbsrl.v vr2, vr1, 1 vbsrl.v vr3, vr1, 2 // edge[23] ld.bu t0, a1, 23 vinsgr2vr.b vr3, t0, 0xe vexth.hu.bu vr11, vr1 vsllwil.hu.bu vr1, vr1, 0 vexth.hu.bu vr12, vr2 vsllwil.hu.bu vr2, vr2, 0 vexth.hu.bu vr13, vr3 vsllwil.hu.bu vr3, vr3, 0 vslli.h vr2, vr2, 1 vslli.h vr12, vr12, 1 vadd.h vr4, vr1, vr2 vadd.h vr14, vr11, vr12 vadd.h vr5, vr4, vr3 vadd.h vr15, vr14, vr13 vssrarni.bu.h vr15, vr5, 2 vbsrl.v vr7, vr15, 7 vbsrl.v vr8, vr15, 6 vbsrl.v vr9, vr15, 5 vbsrl.v vr10, vr15, 4 vbsrl.v vr11, vr15, 3 vbsrl.v vr12, vr15, 2 vbsrl.v vr13, vr15, 1 fst.d f7, a0, 0 fst.d f8, a0, FDEC_STRIDE fst.d f9, a0, FDEC_STRIDE * 2 fst.d f10, a0, FDEC_STRIDE * 3 fst.d f11, a0, FDEC_STRIDE * 4 fst.d f12, a0, FDEC_STRIDE * 5 fst.d f13, a0, FDEC_STRIDE * 6 fst.d f15, a0, FDEC_STRIDE * 7 endfunc_x264 /* void predict_8x8_vr_c( pixel *src, pixel edge[36] ) */ function_x264 predict_8x8_vr_lasx vld vr0, a1, 8 vbsrl.v vr1, vr0, 1 vbsrl.v vr2, vr0, 2 vext2xv.hu.bu xr5, xr0 vext2xv.hu.bu xr6, xr1 vext2xv.hu.bu xr7, xr2 xvadd.h xr10, xr5, xr6 xvadd.h xr11, xr10, xr6 xvadd.h xr12, xr11, xr7 xvssrarni.bu.h xr12, xr12, 2 xvssrarni.bu.h xr10, xr10, 1 xvpermi.d xr13, xr12, 0x08 xvpermi.d xr14, xr10, 0x08 vbsrl.v vr15, vr13, 6 vbsll.v vr16, vr15, 1 vextrins.b vr16, 
vr13, 0x04 vbsll.v vr17, vr16, 1 vextrins.b vr17, vr13, 0x02 vbsll.v vr18, vr17, 1 vextrins.b vr18, vr13, 0x00 fst.d f15, a0, FDEC_STRIDE fst.d f16, a0, FDEC_STRIDE * 3 fst.d f17, a0, FDEC_STRIDE * 5 fst.d f18, a0, FDEC_STRIDE * 7 vbsrl.v vr16, vr14, 7 vbsll.v vr17, vr16, 1 vextrins.b vr17, vr13, 0x05 vbsll.v vr18, vr17, 1 vextrins.b vr18, vr13, 0x03 vbsll.v vr19, vr18, 1 vextrins.b vr19, vr13, 0x01 fst.d f16, a0, 0 fst.d f17, a0, FDEC_STRIDE * 2 fst.d f18, a0, FDEC_STRIDE * 4 fst.d f19, a0, FDEC_STRIDE * 6 endfunc_x264 function_x264 predict_8x8_vr_lsx vld vr0, a1, 8 vbsrl.v vr1, vr0, 1 vbsrl.v vr2, vr0, 2 vexth.hu.bu vr5, vr0 vsllwil.hu.bu vr0, vr0, 0 vexth.hu.bu vr6, vr1 vsllwil.hu.bu vr1, vr1, 0 vexth.hu.bu vr7, vr2 vsllwil.hu.bu vr2, vr2, 0 vadd.h vr9, vr0, vr1 vadd.h vr10, vr5, vr6 vadd.h vr11, vr9, vr1 vadd.h vr12, vr10, vr6 vadd.h vr13, vr11, vr2 vadd.h vr14, vr12, vr7 vssrarni.bu.h vr14, vr13, 2 vssrarni.bu.h vr10, vr9, 1 vbsrl.v vr15, vr14, 6 vbsll.v vr16, vr15, 1 vextrins.b vr16, vr14, 0x04 vbsll.v vr17, vr16, 1 vextrins.b vr17, vr14, 0x02 vbsll.v vr18, vr17, 1 vextrins.b vr18, vr14, 0x00 fst.d f15, a0, FDEC_STRIDE fst.d f16, a0, FDEC_STRIDE * 3 fst.d f17, a0, FDEC_STRIDE * 5 fst.d f18, a0, FDEC_STRIDE * 7 vbsrl.v vr16, vr10, 7 vbsll.v vr17, vr16, 1 vextrins.b vr17, vr14, 0x05 vbsll.v vr18, vr17, 1 vextrins.b vr18, vr14, 0x03 vbsll.v vr19, vr18, 1 vextrins.b vr19, vr14, 0x01 fst.d f16, a0, 0 fst.d f17, a0, FDEC_STRIDE * 2 fst.d f18, a0, FDEC_STRIDE * 4 fst.d f19, a0, FDEC_STRIDE * 6 endfunc_x264 /* void predict_8x8_vl_c( pixel *src, pixel edge[36] ); */ function_x264 predict_8x8_vl_lasx vld vr0, a1, 16 vbsrl.v vr1, vr0, 1 vbsrl.v vr2, vr0, 2 vext2xv.hu.bu xr0, xr0 vext2xv.hu.bu xr1, xr1 vext2xv.hu.bu xr2, xr2 xvadd.h xr3, xr0, xr1 xvadd.h xr4, xr3, xr1 xvadd.h xr5, xr4, xr2 xvssrarni.bu.h xr3, xr3, 1 xvssrarni.bu.h xr5, xr5, 2 xvpermi.d xr6, xr3, 0x8 xvpermi.d xr7, xr5, 0x8 vbsrl.v vr8, vr6, 1 vbsrl.v vr9, vr7, 1 fst.d f6, a0, 0 fst.d f7, a0, FDEC_STRIDE fst.d f8, a0, FDEC_STRIDE * 2 fst.d f9, a0, FDEC_STRIDE * 3 vbsrl.v vr10, vr8, 1 vbsrl.v vr11, vr9, 1 vbsrl.v vr12, vr10, 1 vbsrl.v vr13, vr11, 1 fst.d f10, a0, FDEC_STRIDE * 4 fst.d f11, a0, FDEC_STRIDE * 5 fst.d f12, a0, FDEC_STRIDE * 6 fst.d f13, a0, FDEC_STRIDE * 7 endfunc_x264 function_x264 predict_8x8_vl_lsx vld vr0, a1, 16 vbsrl.v vr1, vr0, 1 vbsrl.v vr2, vr0, 2 vexth.hu.bu vr5, vr0 vsllwil.hu.bu vr0, vr0, 0 vexth.hu.bu vr6, vr1 vsllwil.hu.bu vr1, vr1, 0 vexth.hu.bu vr7, vr2 vsllwil.hu.bu vr2, vr2, 0 vadd.h vr3, vr0, vr1 vadd.h vr13, vr5, vr6 vadd.h vr4, vr3, vr1 vadd.h vr14, vr13, vr6 vadd.h vr5, vr4, vr2 vadd.h vr15, vr14, vr7 vssrarni.bu.h vr13, vr3, 1 vssrarni.bu.h vr15, vr5, 2 vbsrl.v vr8, vr13, 1 vbsrl.v vr9, vr15, 1 fst.d f13, a0, 0 fst.d f15, a0, FDEC_STRIDE fst.d f8, a0, FDEC_STRIDE * 2 fst.d f9, a0, FDEC_STRIDE * 3 vbsrl.v vr8, vr8, 1 vbsrl.v vr9, vr9, 1 vbsrl.v vr10, vr8, 1 vbsrl.v vr11, vr9, 1 fst.d f8, a0, FDEC_STRIDE * 4 fst.d f9, a0, FDEC_STRIDE * 5 fst.d f10, a0, FDEC_STRIDE * 6 fst.d f11, a0, FDEC_STRIDE * 7 endfunc_x264 /**************************************************************************** * 16x16 prediction for intra luma block ****************************************************************************/ /* void x264_predict_16x16_dc_lsx( pixel *src ) */ function_x264 predict_16x16_dc_lsx ld.bu t4, a0, -1 ld.bu t5, a0, FDEC_STRIDE - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 2 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 3 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 4 - 1 add.d t4, 
t4, t5 ld.bu t5, a0, FDEC_STRIDE * 5 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 6 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 7 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 8 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 9 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 10 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 11 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 12 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 13 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 14 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 15 - 1 add.d t4, t4, t5 vld vr4, a0, -FDEC_STRIDE vhaddw.hu.bu vr4, vr4, vr4 vhaddw.wu.hu vr4, vr4, vr4 vhaddw.du.wu vr4, vr4, vr4 vhaddw.qu.du vr4, vr4, vr4 vpickve2gr.wu t5, vr4, 0 add.d t4, t4, t5 addi.d t5, t4, 16 srai.w t5, t5, 5 vreplgr2vr.b vr5, t5 vst vr5, a0, 0 vst vr5, a0, FDEC_STRIDE vst vr5, a0, FDEC_STRIDE * 2 vst vr5, a0, FDEC_STRIDE * 3 vst vr5, a0, FDEC_STRIDE * 4 vst vr5, a0, FDEC_STRIDE * 5 vst vr5, a0, FDEC_STRIDE * 6 vst vr5, a0, FDEC_STRIDE * 7 vst vr5, a0, FDEC_STRIDE * 8 vst vr5, a0, FDEC_STRIDE * 9 vst vr5, a0, FDEC_STRIDE * 10 vst vr5, a0, FDEC_STRIDE * 11 vst vr5, a0, FDEC_STRIDE * 12 vst vr5, a0, FDEC_STRIDE * 13 vst vr5, a0, FDEC_STRIDE * 14 vst vr5, a0, FDEC_STRIDE * 15 endfunc_x264 /* void x264_predict_16x16_dc_left_lsx( pixel *src ) */ function_x264 predict_16x16_dc_left_lsx ld.bu t4, a0, -1 ld.bu t5, a0, FDEC_STRIDE - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 2 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 3 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 4 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 5 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 6 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 7 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 8 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 9 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 10 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 11 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 12 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 13 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 14 - 1 add.d t4, t4, t5 ld.bu t5, a0, FDEC_STRIDE * 15 - 1 add.d t4, t4, t5 addi.d t5, t4, 8 srai.w t5, t5, 4 vreplgr2vr.b vr5, t5 vst vr5, a0, 0 vst vr5, a0, FDEC_STRIDE vst vr5, a0, FDEC_STRIDE * 2 vst vr5, a0, FDEC_STRIDE * 3 vst vr5, a0, FDEC_STRIDE * 4 vst vr5, a0, FDEC_STRIDE * 5 vst vr5, a0, FDEC_STRIDE * 6 vst vr5, a0, FDEC_STRIDE * 7 vst vr5, a0, FDEC_STRIDE * 8 vst vr5, a0, FDEC_STRIDE * 9 vst vr5, a0, FDEC_STRIDE * 10 vst vr5, a0, FDEC_STRIDE * 11 vst vr5, a0, FDEC_STRIDE * 12 vst vr5, a0, FDEC_STRIDE * 13 vst vr5, a0, FDEC_STRIDE * 14 vst vr5, a0, FDEC_STRIDE * 15 endfunc_x264 /* void x264_predict_16x16_dc_top_lsx( pixel *src ) */ function_x264 predict_16x16_dc_top_lsx vld vr4, a0, -FDEC_STRIDE vhaddw.hu.bu vr4, vr4, vr4 vhaddw.wu.hu vr4, vr4, vr4 vhaddw.du.wu vr4, vr4, vr4 vhaddw.qu.du vr4, vr4, vr4 vpickve2gr.wu t5, vr4, 0 addi.d t5, t5, 8 srai.w t5, t5, 4 vreplgr2vr.b vr5, t5 vst vr5, a0, 0 vst vr5, a0, FDEC_STRIDE vst vr5, a0, FDEC_STRIDE * 2 vst vr5, a0, FDEC_STRIDE * 3 vst vr5, a0, FDEC_STRIDE * 4 vst vr5, a0, FDEC_STRIDE * 5 vst vr5, a0, FDEC_STRIDE * 6 vst vr5, a0, FDEC_STRIDE * 7 vst vr5, a0, FDEC_STRIDE * 8 vst vr5, a0, FDEC_STRIDE * 9 vst vr5, a0, FDEC_STRIDE * 10 vst vr5, a0, FDEC_STRIDE * 11 vst vr5, a0, FDEC_STRIDE * 12 vst vr5, a0, FDEC_STRIDE * 13 vst vr5, a0, FDEC_STRIDE * 14 vst vr5, a0, FDEC_STRIDE * 15 endfunc_x264 /* void x264_predict_16x16_dc_128_lsx( pixel *src ) */ function_x264 predict_16x16_dc_128_lsx ori t1, t0, 1 
slli.d t1, t1, BIT_DEPTH - 1 vreplgr2vr.b vr5, t1 vst vr5, a0, 0 vst vr5, a0, FDEC_STRIDE vst vr5, a0, FDEC_STRIDE * 2 vst vr5, a0, FDEC_STRIDE * 3 vst vr5, a0, FDEC_STRIDE * 4 vst vr5, a0, FDEC_STRIDE * 5 vst vr5, a0, FDEC_STRIDE * 6 vst vr5, a0, FDEC_STRIDE * 7 vst vr5, a0, FDEC_STRIDE * 8 vst vr5, a0, FDEC_STRIDE * 9 vst vr5, a0, FDEC_STRIDE * 10 vst vr5, a0, FDEC_STRIDE * 11 vst vr5, a0, FDEC_STRIDE * 12 vst vr5, a0, FDEC_STRIDE * 13 vst vr5, a0, FDEC_STRIDE * 14 vst vr5, a0, FDEC_STRIDE * 15 endfunc_x264 /* void x264_predict_16x16_h_lsx( pixel *src ) */ function_x264 predict_16x16_h_lsx ld.bu t0, a0, -1 ld.bu t1, a0, FDEC_STRIDE - 1 ld.bu t2, a0, FDEC_STRIDE * 2 - 1 ld.bu t3, a0, FDEC_STRIDE * 3 - 1 ld.bu t4, a0, FDEC_STRIDE * 4 - 1 ld.bu t5, a0, FDEC_STRIDE * 5 - 1 ld.bu t6, a0, FDEC_STRIDE * 6 - 1 ld.bu t7, a0, FDEC_STRIDE * 7 - 1 vreplgr2vr.b vr0, t0 vreplgr2vr.b vr1, t1 vreplgr2vr.b vr2, t2 vreplgr2vr.b vr3, t3 vreplgr2vr.b vr4, t4 vreplgr2vr.b vr5, t5 vreplgr2vr.b vr6, t6 vreplgr2vr.b vr7, t7 vst vr0, a0, 0 vst vr1, a0, FDEC_STRIDE vst vr2, a0, FDEC_STRIDE * 2 vst vr3, a0, FDEC_STRIDE * 3 vst vr4, a0, FDEC_STRIDE * 4 vst vr5, a0, FDEC_STRIDE * 5 vst vr6, a0, FDEC_STRIDE * 6 vst vr7, a0, FDEC_STRIDE * 7 ld.bu t0, a0, FDEC_STRIDE * 8 - 1 ld.bu t1, a0, FDEC_STRIDE * 9 - 1 ld.bu t2, a0, FDEC_STRIDE * 10 - 1 ld.bu t3, a0, FDEC_STRIDE * 11 - 1 ld.bu t4, a0, FDEC_STRIDE * 12 - 1 ld.bu t5, a0, FDEC_STRIDE * 13 - 1 ld.bu t6, a0, FDEC_STRIDE * 14 - 1 ld.bu t7, a0, FDEC_STRIDE * 15 - 1 vreplgr2vr.b vr0, t0 vreplgr2vr.b vr1, t1 vreplgr2vr.b vr2, t2 vreplgr2vr.b vr3, t3 vreplgr2vr.b vr4, t4 vreplgr2vr.b vr5, t5 vreplgr2vr.b vr6, t6 vreplgr2vr.b vr7, t7 vst vr0, a0, FDEC_STRIDE * 8 vst vr1, a0, FDEC_STRIDE * 9 vst vr2, a0, FDEC_STRIDE * 10 vst vr3, a0, FDEC_STRIDE * 11 vst vr4, a0, FDEC_STRIDE * 12 vst vr5, a0, FDEC_STRIDE * 13 vst vr6, a0, FDEC_STRIDE * 14 vst vr7, a0, FDEC_STRIDE * 15 endfunc_x264 /* void x264_predict_16x16_v_lsx( pixel *src ) */ function_x264 predict_16x16_v_lsx fld.d f4, a0, -FDEC_STRIDE fld.d f5, a0, 4 - FDEC_STRIDE fld.d f6, a0, 8 - FDEC_STRIDE fld.d f7, a0, 12 - FDEC_STRIDE vilvl.w vr4, vr5, vr4 vilvl.w vr6, vr7, vr6 vilvl.d vr4, vr6, vr4 vst vr4, a0, 0 vst vr4, a0, FDEC_STRIDE vst vr4, a0, FDEC_STRIDE * 2 vst vr4, a0, FDEC_STRIDE * 3 vst vr4, a0, FDEC_STRIDE * 4 vst vr4, a0, FDEC_STRIDE * 5 vst vr4, a0, FDEC_STRIDE * 6 vst vr4, a0, FDEC_STRIDE * 7 vst vr4, a0, FDEC_STRIDE * 8 vst vr4, a0, FDEC_STRIDE * 9 vst vr4, a0, FDEC_STRIDE * 10 vst vr4, a0, FDEC_STRIDE * 11 vst vr4, a0, FDEC_STRIDE * 12 vst vr4, a0, FDEC_STRIDE * 13 vst vr4, a0, FDEC_STRIDE * 14 vst vr4, a0, FDEC_STRIDE * 15 endfunc_x264 /* void x264_predict_16x16_p_lasx( pixel *src ) */ const mulc .short 1, 2, 3, 4, 5, 6, 7, 8 endconst const muld .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 endconst function_x264 predict_16x16_p_lasx la.local t0, mulc vld vr3, t0, 0 fld.d f4, a0, 8 - FDEC_STRIDE fld.d f5, a0, -1 - FDEC_STRIDE vxor.v vr0, vr0, vr0 vilvl.b vr4, vr0, vr4 vilvl.b vr5, vr0, vr5 vshuf4i.h vr5, vr5, 0x1b vbsll.v vr6, vr5, 8 vpackod.d vr5, vr6, vr5 vsub.h vr4, vr4, vr5 vmul.h vr4, vr4, vr3 vhaddw.w.h vr4, vr4, vr4 vhaddw.d.w vr4, vr4, vr4 vhaddw.q.d vr4, vr4, vr4 vpickve2gr.w t0, vr4, 0 /* H */ fld.d f6, a0, FDEC_STRIDE * 8 - 1 fld.d f7, a0, FDEC_STRIDE * 9 - 1 fld.d f8, a0, FDEC_STRIDE * 10 - 1 fld.d f9, a0, FDEC_STRIDE * 11 - 1 fld.d f10, a0, FDEC_STRIDE * 12 - 1 fld.d f11, a0, FDEC_STRIDE * 13 - 1 fld.d f12, a0, FDEC_STRIDE * 14 - 1 fld.d f13, a0, FDEC_STRIDE * 15 - 1 vilvl.b vr6, vr7, 
vr6 vilvl.b vr8, vr9, vr8 vilvl.b vr10, vr11, vr10 vilvl.b vr12, vr13, vr12 vilvl.h vr6, vr8, vr6 vilvl.h vr10, vr12, vr10 vilvl.w vr6, vr10, vr6 fld.d f7, a0, FDEC_STRIDE * 6 - 1 fld.d f8, a0, FDEC_STRIDE * 5 - 1 fld.d f9, a0, FDEC_STRIDE * 4 - 1 fld.d f10, a0, FDEC_STRIDE * 3 - 1 fld.d f11, a0, FDEC_STRIDE * 2 - 1 fld.d f12, a0, FDEC_STRIDE - 1 fld.d f13, a0, -1 fld.d f14, a0, -FDEC_STRIDE - 1 vilvl.b vr7, vr8, vr7 vilvl.b vr9, vr10, vr9 vilvl.b vr11, vr12, vr11 vilvl.b vr13, vr14, vr13 vilvl.h vr7, vr9, vr7 vilvl.h vr11, vr13, vr11 vilvl.w vr7, vr11, vr7 vilvl.b vr6, vr0, vr6 vilvl.b vr7, vr0, vr7 vsub.h vr6, vr6, vr7 vmul.h vr6, vr6, vr3 vhaddw.w.h vr6, vr6, vr6 vhaddw.d.w vr6, vr6, vr6 vhaddw.q.d vr6, vr6, vr6 vpickve2gr.w t1, vr6, 0 /* V */ ld.bu t2, a0, FDEC_STRIDE * 15 - 1 ld.bu t3, a0, 15 - FDEC_STRIDE add.w t2, t2, t3 slli.w t2, t2, 4 /* a */ slli.w t3, t0, 2 add.w t0, t0, t3 addi.w t0, t0, 32 srai.w t0, t0, 6 /* b */ slli.w t3, t1, 2 add.w t1, t1, t3 addi.w t1, t1, 32 srai.w t1, t1, 6 /* c */ add.w t3, t0, t1 slli.w t4, t3, 3 sub.w t4, t4, t3 sub.w t5, t2, t4 addi.w t5, t5, 16 /* i00 */ la.local t3, muld xvld xr14, t3, 0 xvreplgr2vr.h xr12, t0 xvmul.h xr12, xr12, xr14 .rept 16 xvreplgr2vr.h xr14, t5 xvadd.h xr13, xr12, xr14 xvssrani.bu.h xr15, xr13, 5 xvstelm.d xr15, a0, 0, 0 xvstelm.d xr15, a0, 8, 2 addi.d a0, a0, FDEC_STRIDE add.w t5, t5, t1 .endr endfunc_x264 function_x264 predict_16x16_p_lsx la.local t0, mulc vld vr3, t0, 0 fld.d f4, a0, 8 - FDEC_STRIDE fld.d f5, a0, -1 - FDEC_STRIDE vxor.v vr0, vr0, vr0 vilvl.b vr4, vr0, vr4 vilvl.b vr5, vr0, vr5 vshuf4i.h vr5, vr5, 0x1b vbsll.v vr6, vr5, 8 vpackod.d vr5, vr6, vr5 vsub.h vr4, vr4, vr5 vmul.h vr4, vr4, vr3 vhaddw.w.h vr4, vr4, vr4 vhaddw.d.w vr4, vr4, vr4 vhaddw.q.d vr4, vr4, vr4 vpickve2gr.w t0, vr4, 0 /* H */ fld.d f6, a0, FDEC_STRIDE * 8 - 1 fld.d f7, a0, FDEC_STRIDE * 9 - 1 fld.d f8, a0, FDEC_STRIDE * 10 - 1 fld.d f9, a0, FDEC_STRIDE * 11 - 1 fld.d f10, a0, FDEC_STRIDE * 12 - 1 fld.d f11, a0, FDEC_STRIDE * 13 - 1 fld.d f12, a0, FDEC_STRIDE * 14 - 1 fld.d f13, a0, FDEC_STRIDE * 15 - 1 vilvl.b vr6, vr7, vr6 vilvl.b vr8, vr9, vr8 vilvl.b vr10, vr11, vr10 vilvl.b vr12, vr13, vr12 vilvl.h vr6, vr8, vr6 vilvl.h vr10, vr12, vr10 vilvl.w vr6, vr10, vr6 fld.d f7, a0, FDEC_STRIDE * 6 - 1 fld.d f8, a0, FDEC_STRIDE * 5 - 1 fld.d f9, a0, FDEC_STRIDE * 4 - 1 fld.d f10, a0, FDEC_STRIDE * 3 - 1 fld.d f11, a0, FDEC_STRIDE * 2 - 1 fld.d f12, a0, FDEC_STRIDE - 1 fld.d f13, a0, -1 fld.d f14, a0, -FDEC_STRIDE - 1 vilvl.b vr7, vr8, vr7 vilvl.b vr9, vr10, vr9 vilvl.b vr11, vr12, vr11 vilvl.b vr13, vr14, vr13 vilvl.h vr7, vr9, vr7 vilvl.h vr11, vr13, vr11 vilvl.w vr7, vr11, vr7 vilvl.b vr6, vr0, vr6 vilvl.b vr7, vr0, vr7 vsub.h vr6, vr6, vr7 vmul.h vr6, vr6, vr3 vhaddw.w.h vr6, vr6, vr6 vhaddw.d.w vr6, vr6, vr6 vhaddw.q.d vr6, vr6, vr6 vpickve2gr.w t1, vr6, 0 /* V */ ld.bu t2, a0, FDEC_STRIDE * 15 - 1 ld.bu t3, a0, 15 - FDEC_STRIDE add.w t2, t2, t3 slli.w t2, t2, 4 /* a */ slli.w t3, t0, 2 add.w t0, t0, t3 addi.w t0, t0, 32 srai.w t0, t0, 6 /* b */ slli.w t3, t1, 2 add.w t1, t1, t3 addi.w t1, t1, 32 srai.w t1, t1, 6 /* c */ add.w t3, t0, t1 slli.w t4, t3, 3 sub.w t4, t4, t3 sub.w t5, t2, t4 addi.w t5, t5, 16 /* i00 */ la.local t3, muld vld vr14, t3, 0 vld vr20, t3, 16 vreplgr2vr.h vr12, t0 vmul.h vr22, vr12, vr14 vmul.h vr23, vr12, vr20 .rept 16 vreplgr2vr.h vr14, t5 vadd.h vr13, vr22, vr14 vadd.h vr16, vr23, vr14 vssrani.bu.h vr15, vr13, 5 vssrani.bu.h vr17, vr16, 5 vpermi.w vr17, vr15, 0x44 vst vr17, a0, 0 addi.d a0, a0, FDEC_STRIDE add.w t5, 
t5, t1 .endr endfunc_x264 #endif /* !HIGH_BIT_DEPT H */ x264-master/common/loongarch/predict-c.c000066400000000000000000000101331502133446700203370ustar00rootroot00000000000000/***************************************************************************** * predict-c.c: loongarch intra prediction ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "predict.h" void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] ) { #if !HIGH_BIT_DEPTH if( cpu&X264_CPU_LSX ) { pf[I_PRED_16x16_V ] = x264_predict_16x16_v_lsx; pf[I_PRED_16x16_H ] = x264_predict_16x16_h_lsx; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_lsx; pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_lsx; pf[I_PRED_16x16_DC_TOP ]= x264_predict_16x16_dc_top_lsx; pf[I_PRED_16x16_DC_128 ]= x264_predict_16x16_dc_128_lsx; pf[I_PRED_16x16_P ] = x264_predict_16x16_p_lsx; } if( cpu&X264_CPU_LASX ) { pf[I_PRED_16x16_P ] = x264_predict_16x16_p_lasx; } #endif } void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] ) { #if !HIGH_BIT_DEPTH if( cpu&X264_CPU_LSX ) { pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_lsx; pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_lsx; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_lsx; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_lsx; pf[I_PRED_CHROMA_DC_128] = x264_predict_8x8c_dc_128_lsx; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_lsx; pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x8c_dc_left_lsx; } #endif } void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { #if !HIGH_BIT_DEPTH if( cpu&X264_CPU_LSX ) { pf[I_PRED_8x8_V] = x264_predict_8x8_v_lsx; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_lsx; pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_lsx; pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_lsx; pf[I_PRED_8x8_DC_128] = x264_predict_8x8_dc_128_lsx; pf[I_PRED_8x8_H] = x264_predict_8x8_h_lsx; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_lsx; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_lsx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_lsx; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_lsx; } if( cpu&X264_CPU_LASX ) { pf[I_PRED_8x8_H] = x264_predict_8x8_h_lasx; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_lasx; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_lasx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_lasx; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_lasx; } #endif } void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] ) { #if !HIGH_BIT_DEPTH if( cpu&X264_CPU_LSX ) { pf[I_PRED_4x4_V] = x264_predict_4x4_v_lsx; pf[I_PRED_4x4_H] = 
x264_predict_4x4_h_lsx; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_lsx; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_lsx; pf[I_PRED_4x4_DC_LEFT]= x264_predict_4x4_dc_left_lsx; pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_lsx; pf[I_PRED_4x4_DC_128] = x264_predict_4x4_dc_128_lsx; } #endif } x264-master/common/loongarch/predict.h000066400000000000000000000146711502133446700201370ustar00rootroot00000000000000/***************************************************************************** * predict.h: loongarch intra prediction ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_LOONGARCH_PREDICT_H #define X264_LOONGARCH_PREDICT_H #define x264_predict_8x8c_p_lsx x264_template(predict_8x8c_p_lsx) void x264_predict_8x8c_p_lsx(uint8_t *p_src); #define x264_predict_8x8c_v_lsx x264_template(predict_8x8c_v_lsx) void x264_predict_8x8c_v_lsx(uint8_t *p_src); #define x264_predict_8x8c_h_lsx x264_template(predict_8x8c_h_lsx) void x264_predict_8x8c_h_lsx(uint8_t *p_src); #define x264_predict_8x8c_dc_lsx x264_template(predict_8x8c_dc_lsx) void x264_predict_8x8c_dc_lsx(pixel *src); #define x264_predict_8x8c_dc_128_lsx x264_template(predict_8x8c_dc_128_lsx) void x264_predict_8x8c_dc_128_lsx(pixel *src); #define x264_predict_8x8c_dc_top_lsx x264_template(predict_8x8c_dc_top_lsx) void x264_predict_8x8c_dc_top_lsx(pixel *src); #define x264_predict_8x8c_dc_left_lsx x264_template(predict_8x8c_dc_left_lsx) void x264_predict_8x8c_dc_left_lsx(pixel *src); #define x264_predict_16x16_dc_lsx x264_template(predict_16x16_dc_lsx) void x264_predict_16x16_dc_lsx( pixel *src ); #define x264_predict_16x16_dc_left_lsx x264_template(predict_16x16_dc_left_lsx) void x264_predict_16x16_dc_left_lsx( pixel *src ); #define x264_predict_16x16_dc_top_lsx x264_template(predict_16x16_dc_top_lsx) void x264_predict_16x16_dc_top_lsx( pixel *src ); #define x264_predict_16x16_dc_128_lsx x264_template(predict_16x16_dc_128_lsx) void x264_predict_16x16_dc_128_lsx( pixel *src ); #define x264_predict_16x16_h_lsx x264_template(predict_16x16_h_lsx) void x264_predict_16x16_h_lsx( pixel *src ); #define x264_predict_16x16_v_lsx x264_template(predict_16x16_v_lsx) void x264_predict_16x16_v_lsx( pixel *src ); #define x264_predict_16x16_p_lasx x264_template(predict_16x16_p_lasx) void x264_predict_16x16_p_lasx( pixel *src ); #define x264_predict_16x16_p_lsx x264_template(predict_16x16_p_lsx) void x264_predict_16x16_p_lsx( pixel *src ); #define x264_predict_8x8_v_lsx x264_template(predict_8x8_v_lsx) void x264_predict_8x8_v_lsx( pixel *src, pixel edge[36] ); #define 
x264_predict_8x8_h_lasx x264_template(predict_8x8_h_lasx) void x264_predict_8x8_h_lasx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_h_lsx x264_template(predict_8x8_h_lsx) void x264_predict_8x8_h_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_dc_lsx x264_template(predict_8x8_dc_lsx) void x264_predict_8x8_dc_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_dc_left_lsx x264_template(predict_8x8_dc_left_lsx) void x264_predict_8x8_dc_left_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_dc_top_lsx x264_template(predict_8x8_dc_top_lsx) void x264_predict_8x8_dc_top_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_dc_128_lsx x264_template(predict_8x8_dc_128_lsx) void x264_predict_8x8_dc_128_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddl_lasx x264_template(predict_8x8_ddl_lasx) void x264_predict_8x8_ddl_lasx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddl_lsx x264_template(predict_8x8_ddl_lsx) void x264_predict_8x8_ddl_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddr_lasx x264_template(predict_8x8_ddr_lasx) void x264_predict_8x8_ddr_lasx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddr_lsx x264_template(predict_8x8_ddr_lsx) void x264_predict_8x8_ddr_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vr_lasx x264_template(predict_8x8_vr_lasx) void x264_predict_8x8_vr_lasx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vr_lsx x264_template(predict_8x8_vr_lsx) void x264_predict_8x8_vr_lsx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vl_lasx x264_template(predict_8x8_vl_lasx) void x264_predict_8x8_vl_lasx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vl_lsx x264_template(predict_8x8_vl_lsx) void x264_predict_8x8_vl_lsx( pixel *src, pixel edge[36] ); #define x264_predict_4x4_v_lsx x264_template(predict_4x4_v_lsx) void x264_predict_4x4_v_lsx( pixel *p_src ); #define x264_predict_4x4_h_lsx x264_template(predict_4x4_h_lsx) void x264_predict_4x4_h_lsx( pixel *p_src ); #define x264_predict_4x4_dc_lsx x264_template(predict_4x4_dc_lsx) void x264_predict_4x4_dc_lsx( pixel *p_src ); #define x264_predict_4x4_ddl_lsx x264_template(predict_4x4_ddl_lsx) void x264_predict_4x4_ddl_lsx( pixel *p_src ); #define x264_predict_4x4_dc_top_lsx x264_template(predict_4x4_dc_top_lsx) void x264_predict_4x4_dc_top_lsx( pixel *p_src ); #define x264_predict_4x4_dc_left_lsx x264_template(predict_4x4_dc_left_lsx) void x264_predict_4x4_dc_left_lsx( pixel *p_src ); #define x264_predict_4x4_dc_128_lsx x264_template(predict_4x4_dc_128_lsx) void x264_predict_4x4_dc_128_lsx( pixel *p_src ); #define x264_predict_4x4_init_loongarch x264_template(predict_4x4_init_loongarch) void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] ); #define x264_predict_8x8_init_loongarch x264_template(predict_8x8_init_loongarch) void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); #define x264_predict_8x8c_init_loongarch x264_template(predict_8x8c_init_loongarch) void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] ); #define x264_predict_16x16_init_loongarch x264_template(predict_16x16_init_loongarch) void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] ); #endif x264-master/common/loongarch/quant-a.S000066400000000000000000001211761502133446700200250ustar00rootroot00000000000000/***************************************************************************** * quant-a.S: LoongArch quantization and 
level-run ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Shiyou Yin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" const last64_shuf .int 0, 4, 1, 5, 2, 6, 3, 7 endconst /* * int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ) */ .macro QUANT_ONE_LASX s1, s2, s3, s4 xvld xr1, \s1, 0 /* Load dctcoef */ xvadda.h \s4, xr1, \s3 xvmuh.hu \s4, \s4, \s2 xvsigncov.h \s4, xr1, \s4 xvst \s4, \s1, 0 .endm function_x264 quant_4x4x4_lasx xvld xr2, a1, 0 xvld xr3, a2, 0 QUANT_ONE_LASX a0, xr2, xr3, xr4 addi.d a0, a0, 32 QUANT_ONE_LASX a0, xr2, xr3, xr0 xvssrlni.h.w xr0, xr4, 0 addi.d a0, a0, 32 QUANT_ONE_LASX a0, xr2, xr3, xr4 addi.d a0, a0, 32 QUANT_ONE_LASX a0, xr2, xr3, xr5 xvssrlni.h.w xr5, xr4, 0 xvssrlni.h.w xr5, xr0, 0 xvseqi.w xr5, xr5, 0 xvmskltz.w xr5, xr5 xvpickve2gr.w t0, xr5, 0 xvpickve2gr.w t1, xr5, 4 alsl.d t0, t1, t0, 4 and t0, t0, t1 xori a0, t0, 0xf endfunc_x264 .macro QUANT_ONE_LSX tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 vld vr0, \tmp1, 0 vld vr1, \tmp1, 16 vadda.h \tmp6, vr0, \tmp4 vadda.h \tmp7, vr1, \tmp5 vmuh.hu \tmp6, \tmp6, \tmp2 vmuh.hu \tmp7, \tmp7, \tmp3 vsigncov.h \tmp6, vr0, \tmp6 vsigncov.h \tmp7, vr1, \tmp7 vst \tmp6, \tmp1, 0 vst \tmp7, \tmp1, 16 .endm function_x264 quant_4x4x4_lsx vld vr2, a1, 0 vld vr3, a1, 16 vld vr4, a2, 0 vld vr5, a2, 16 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr6, vr7 addi.d a0, a0, 32 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr8, vr9 vssrlni.h.w vr8, vr6, 0 vssrlni.h.w vr9, vr7, 0 addi.d a0, a0, 32 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr10, vr11 addi.d a0, a0, 32 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr12, vr13 vssrlni.h.w vr12, vr10, 0 vssrlni.h.w vr13, vr11, 0 vssrlni.h.w vr12, vr8, 0 vssrlni.h.w vr13, vr9, 0 vseqi.w vr12, vr12, 0 vseqi.w vr13, vr13, 0 vmskltz.w vr12, vr12 vmskltz.w vr13, vr13 vpickve2gr.w t0, vr12, 0 vpickve2gr.w t1, vr13, 0 alsl.d t0, t1, t0, 4 and t0, t0, t1 xori a0, t0, 0xf endfunc_x264 function_x264 quant_4x4_lsx vld vr2, a1, 0 vld vr3, a1, 16 vld vr4, a2, 0 vld vr5, a2, 16 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr10, vr11 vor.v vr22, vr10, vr11 vpickve2gr.d t0, vr22, 0 vpickve2gr.d t1, vr22, 1 or t2, t0, t1 addi.w t3, zero, 1 maskeqz a0, t3, t2 endfunc_x264 function_x264 quant_8x8_lsx vld vr2, a1, 0 vld vr3, a1, 16 vld vr4, a2, 0 vld vr5, a2, 16 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr12, vr13 addi.d a0, a0, 32 vld vr2, a1, 32 vld vr3, a1, 48 vld vr4, a2, 32 vld vr5, a2, 48 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr14, vr15 addi.d a0, a0, 32 vld vr2, a1, 64 vld vr3, a1, 80 vld vr4, a2, 64 vld vr5, a2, 80 QUANT_ONE_LSX a0, 
vr2, vr3, vr4, vr5, vr22, vr23 addi.d a0, a0, 32 vld vr2, a1, 96 vld vr3, a1, 112 vld vr4, a2, 96 vld vr5, a2, 112 QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr7, vr8 vor.v vr12, vr12, vr14 vor.v vr13, vr13, vr15 vor.v vr22, vr22, vr7 vor.v vr23, vr23, vr8 vor.v vr12, vr12, vr22 vor.v vr13, vr13, vr23 vor.v vr11, vr12, vr13 vpickve2gr.d t0, vr11, 0 vpickve2gr.d t1, vr11, 1 or t2, t0, t1 addi.w t3, zero, 1 maskeqz a0, t3, t2 endfunc_x264 function_x264 quant_4x4_dc_lsx vld vr0, a0, 0 vld vr1, a0, 16 vreplgr2vr.w vr2, a1 vreplgr2vr.w vr3, a2 vslei.h vr4, vr0, 0 vslei.h vr5, vr1, 0 vexth.w.h vr7, vr0 vsllwil.w.h vr6, vr0, 0 vexth.w.h vr9, vr1 vsllwil.w.h vr8, vr1, 0 vadda.w vr6, vr3, vr6 vadda.w vr7, vr3, vr7 vadda.w vr8, vr3, vr8 vadda.w vr9, vr3, vr9 vmul.w vr6, vr6, vr2 vmul.w vr7, vr7, vr2 vmul.w vr8, vr8, vr2 vmul.w vr9, vr9, vr2 vsrani.h.w vr8, vr6, 16 vsrani.h.w vr9, vr7, 16 vpermi.w vr10, vr9, 0x0E vpermi.w vr9, vr8, 0x44 vpermi.w vr10, vr8, 0x4E vneg.h vr11, vr9 vneg.h vr12, vr10 vbitsel.v vr13, vr9, vr11, vr4 vbitsel.v vr14, vr10, vr12, vr5 vst vr13, a0, 0 vst vr14, a0, 16 vor.v vr15, vr11, vr12 vpickve2gr.d t0, vr15, 0 vpickve2gr.d t1, vr15, 1 or t2, t0, t1 addi.w t3, zero, 1 maskeqz a0, t3, t2 endfunc_x264 /* * int quant_2x2_dc( dctcoef dct[4], int mf, int bias ) */ function_x264 quant_2x2_dc_lsx fld.d f0, a0, 0 vreplgr2vr.w vr1, a1 vreplgr2vr.w vr2, a2 vslei.h vr3, vr0, 0 vsllwil.w.h vr4, vr0, 0 vadda.w vr4, vr4, vr2 vmul.w vr4, vr4, vr1 vsrani.h.w vr4, vr4, 16 vneg.h vr8, vr4 vbitsel.v vr9, vr4, vr8, vr3 vstelm.d vr9, a0, 0, 0 vpickve2gr.w t0, vr9, 0 vpickve2gr.w t1, vr9, 1 or t2, t0, t1 addi.w t3, zero, 1 maskeqz a0, t3, t2 endfunc_x264 /* * int coeff_last64_c(dctcoef *l) */ function_x264 coeff_last64_lasx addi.w t0, zero, 63 xvxor.v xr20, xr0, xr0 xvld xr0, a0, 0 xvld xr1, a0, 32 xvld xr2, a0, 64 xvld xr3, a0, 96 xvldi xr4, 1 la.local t1, last64_shuf xvld xr7, t1, 0 xvldi xr9, 0x408 xvldi xr10, 0x401 xvssrlni.bu.h xr1, xr0, 0 xvssrlni.bu.h xr3, xr2, 0 xvsle.bu xr5, xr4, xr1 xvsle.bu xr6, xr4, xr3 xvssrlni.bu.h xr6, xr5, 4 xvperm.w xr6, xr6, xr7 xvclz.w xr7, xr6 xvssrlni.hu.w xr7, xr7, 2 xvpermi.d xr8, xr7, 0xd8 xvsub.h xr9, xr9, xr8 xvsll.h xr10, xr10, xr9 xvssrlni.bu.h xr10, xr10, 1 xvclz.d xr11, xr10 xvpickve2gr.w t3, xr11, 0 sub.w a0, t0, t3 endfunc_x264 function_x264 coeff_last64_lsx addi.w t0, zero, 63 vxor.v vr20, vr0, vr0 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 vld vr4, a0, 64 vld vr5, a0, 80 vld vr6, a0, 96 vld vr7, a0, 112 vldi vr8, 1 vldi vr9, 0x408 vldi vr10, 0x401 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vssrlni.bu.h vr2, vr2, 0 vssrlni.bu.h vr3, vr3, 0 vssrlni.bu.h vr4, vr4, 0 vssrlni.bu.h vr5, vr5, 0 vssrlni.bu.h vr6, vr6, 0 vssrlni.bu.h vr7, vr7, 0 vpermi.w vr2, vr0, 0x44 vpermi.w vr3, vr1, 0x44 vpermi.w vr6, vr4, 0x44 vpermi.w vr7, vr5, 0x44 vsle.bu vr2, vr8, vr2 vsle.bu vr3, vr8, vr3 vsle.bu vr6, vr8, vr6 vsle.bu vr7, vr8, vr7 vssrlni.bu.h vr2, vr2, 4 vssrlni.bu.h vr3, vr3, 4 vssrlni.bu.h vr6, vr6, 4 vssrlni.bu.h vr7, vr7, 4 vpermi.w vr6, vr2, 0x44 vpermi.w vr7, vr3, 0x44 vpermi.w vr11, vr7, 0x0E vpermi.w vr7, vr6, 0x44 vpermi.w vr7, vr7, 0xD8 vpermi.w vr11, vr6, 0x4E vpermi.w vr11, vr11, 0xD8 vclz.w vr7, vr7 vclz.w vr11, vr11 vssrlni.hu.w vr7, vr7, 2 vssrlni.hu.w vr11, vr11, 2 vpermi.w vr12, vr11, 0x0E vpermi.w vr11, vr7, 0x44 vpermi.w vr12, vr7, 0x4E vsub.h vr11, vr9, vr11 vsub.h vr12, vr9, vr12 vsll.h vr13, vr10, vr11 vsll.h vr14, vr10, vr12 vssrlni.bu.h vr13, vr13, 1 vssrlni.bu.h vr14, vr14, 1 vclz.d vr15, vr14 vpickve2gr.w t1, 
vr15, 0 sub.w a0, t0, t1 endfunc_x264 /* * int coeff_last16_c(dctcoef *l) */ function_x264 coeff_last16_lasx addi.w t0, zero, 15 xvld xr0, a0, 0 xvldi xr2, 1 xvssrlni.bu.h xr0, xr0, 0 xvpermi.d xr1, xr0, 0xd8 xvsle.bu xr3, xr2, xr1 xvssrlni.bu.h xr3, xr3, 4 xvclz.d xr4, xr3 xvpickve2gr.w t1, xr4, 0 srai.w t1, t1, 2 sub.w a0, t0, t1 endfunc_x264 function_x264 coeff_last16_lsx addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 vldi vr2, 1 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vpermi.w vr1, vr0, 0x44 vsle.bu vr3, vr2, vr1 vssrlni.bu.h vr3, vr3, 4 vclz.d vr4, vr3 vpickve2gr.w t1, vr4, 0 srai.w t1, t1, 2 sub.w a0, t0, t1 endfunc_x264 /* * int coeff_last15_c(dctcoef *l) */ function_x264 coeff_last15_lasx addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 xvldi xr3, 1 vinsgr2vr.h vr1, zero, 7 xvpermi.q xr1, xr0, 0x20 xvssrlni.bu.h xr1, xr1, 0 xvpermi.d xr2, xr1, 0xd8 xvsle.bu xr4, xr3, xr2 xvssrlni.bu.h xr4, xr4, 4 xvclz.d xr5, xr4 xvpickve2gr.w t1, xr5, 0 srai.w t1, t1, 2 sub.w a0, t0, t1 endfunc_x264 function_x264 coeff_last15_lsx addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 vldi vr2, 1 vinsgr2vr.h vr1, zero, 7 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vpermi.w vr1, vr0, 0x44 vsle.bu vr3, vr2, vr1 vssrlni.bu.h vr3, vr3, 4 vclz.d vr4, vr3 vpickve2gr.w t1, vr4, 0 srai.w t1, t1, 2 sub.w a0, t0, t1 endfunc_x264 /* * int coeff_last8_c(dctcoef *l) */ function_x264 coeff_last8_lsx addi.w t0, zero, 7 vld vr0, a0, 0 vclz.d vr1, vr0 vpickve2gr.w t1, vr1, 0 vpickve2gr.w t2, vr1, 2 li.d t3, 64 bne t2, t3, .LAST8_LOW_LSX addi.d t4, t1, 0 addi.d t0, t0, -4 b .LAST8_END_LSX .LAST8_LOW_LSX: addi.d t4, t2, 0 .LAST8_END_LSX: srai.w t4, t4, 4 sub.w a0, t0, t4 endfunc_x264 /* * int coeff_last4_c(dctcoef *l) */ function_x264 coeff_last4_lsx addi.w t0, zero, 3 vld vr0, a0, 0 vclz.d vr1, vr0 vpickve2gr.w t1, vr1, 0 srai.w t1, t1, 4 sub.w a0, t0, t1 endfunc_x264 // (dct[i] * dequant_mf[i]) << (i_qbits) .macro DCT_MF a0, a1, in0, out0, out1 vld vr1, \a0, 0 xvld xr2, \a1, 0 vext2xv.w.h xr5, xr1 xvmul.w xr5, xr5, xr2 xvsll.w \out0, xr5, \in0 vld vr1, \a0, 16 xvld xr2, \a1, 32 vext2xv.w.h xr5, xr1 xvmul.w xr5, xr5, xr2 xvsll.w \out1, xr5, \in0 .endm // (dct[i] * dequant_mf[i] + f) >> (-i_qbits) .macro DCT_MF_F a0, a1, in0, out0, out1 vld vr1, \a0, 0 xvld xr2, \a1, 0 vext2xv.w.h xr5, xr1 xvmul.w xr5, xr5, xr2 xvsrar.w \out0, xr5, \in0 vld vr1, \a0, 16 xvld xr2, \a1, 32 vext2xv.w.h xr5, xr1 xvmul.w xr5, xr5, xr2 xvsrar.w \out1, xr5, \in0 .endm /* * void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp ) */ function_x264 dequant_4x4_lasx addi.w t1, zero, 6 addi.w t2, zero, 4 div.w t0, a2, t1 sub.w t0, t0, t2 // i_qp/6 - 4 mod.w t1, a2, t1 // i_qp%6 slli.w t1, t1, 6 add.d a1, a1, t1 blt t0, zero, .DQ4x4_DEQUANT_SHR // i_qbits >= 0 xvreplgr2vr.w xr0, t0 DCT_MF a0, a1, xr0, xr6, xr7 b .DQ4x4_END .DQ4x4_DEQUANT_SHR: sub.w t4, zero, t0 xvreplgr2vr.w xr4, t4 DCT_MF_F a0, a1, xr4, xr6, xr7 .DQ4x4_END: xvpickev.h xr8, xr7, xr6 xvpermi.d xr8, xr8, 0xd8 xvst xr8, a0, 0 endfunc_x264 .macro DCT_MF_LSX tmp0, tmp1, in0, out0, out1, out2, out3 vld vr0, \tmp0, 0 vld vr1, \tmp1, 0 vld vr2, \tmp1, 16 vexth.w.h vr4, vr0 vsllwil.w.h vr3, vr0, 0 vmul.w vr3, vr3, vr1 vmul.w vr4, vr4, vr2 vsll.w \out0, vr3, \in0 vsll.w \out1, vr4, \in0 vld vr0, \tmp0, 16 vld vr1, \tmp1, 32 vld vr2, \tmp1, 48 vsllwil.w.h vr3, vr0, 0 vpermi.w vr4, vr0, 0x0E vsllwil.w.h vr4, vr4, 0 vmul.w vr3, vr3, vr1 vmul.w vr4, vr4, vr2 vsll.w \out2, vr3, \in0 vsll.w \out3, vr4, \in0 .endm .macro DCT_MF_F_LSX tmp0, tmp1, in0, out0, out1, 
out2, out3 vld vr0, \tmp0, 0 vld vr1, \tmp1, 0 vld vr2, \tmp1, 16 vexth.w.h vr4, vr0 vsllwil.w.h vr3, vr0, 0 vmul.w vr3, vr3, vr1 vmul.w vr4, vr4, vr2 vsrar.w \out0, vr3, \in0 vsrar.w \out1, vr4, \in0 vld vr0, \tmp0, 16 vld vr1, \tmp1, 32 vld vr2, \tmp1, 48 vexth.w.h vr4, vr0 vsllwil.w.h vr3, vr0, 0 vmul.w vr3, vr3, vr1 vmul.w vr4, vr4, vr2 vsrar.w \out2, vr3, \in0 vsrar.w \out3, vr4, \in0 .endm function_x264 dequant_4x4_lsx addi.w t1, zero, 6 addi.w t2, zero, 4 div.w t0, a2, t1 sub.w t0, t0, t2 mod.w t1, a2, t1 slli.w t1, t1, 6 add.d a1, a1, t1 blt t0, zero, .DQ4x4_DEQUANT_SHR_LSX vreplgr2vr.w vr6, t0 DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 b .DQ4x4_END_LSX .DQ4x4_DEQUANT_SHR_LSX: sub.w t4, zero, t0 vreplgr2vr.w vr6, t4 DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 .DQ4x4_END_LSX: vpickev.h vr11, vr9, vr7 vpickev.h vr12, vr10, vr8 vpermi.w vr13, vr12, 0x0E vpermi.w vr12, vr11, 0x44 vpermi.w vr13, vr11, 0x4E vst vr12, a0, 0 vst vr13, a0, 16 endfunc_x264 /* * void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp ) */ function_x264 dequant_8x8_lasx addi.w t1, zero, 6 div.w t0, a2, t1 sub.w t0, t0, t1 mod.w t1, a2, t1 // i_qp%6 slli.w t1, t1, 8 add.d a1, a1, t1 blt t0, zero, .DQ8x8_DEQUANT_SHR // i_qbits >= 0 xvreplgr2vr.w xr0, t0 DCT_MF a0, a1, xr0, xr6, xr7 xvpickev.h xr8, xr7, xr6 xvpermi.d xr8, xr8, 0xd8 xvst xr8, a0, 0 .rept 3 addi.d a0, a0, 32 addi.d a1, a1, 64 DCT_MF a0, a1, xr0, xr6, xr7 xvpickev.h xr8, xr7, xr6 xvpermi.d xr8, xr8, 0xd8 xvst xr8, a0, 0 .endr b .DQ8x8_END // i_qbits < 0 .DQ8x8_DEQUANT_SHR: sub.w t4, zero, t0 xvreplgr2vr.w xr4, t4 DCT_MF_F a0, a1, xr4, xr6, xr7 xvpickev.h xr8, xr7, xr6 xvpermi.d xr8, xr8, 0xd8 xvst xr8, a0, 0 .rept 3 addi.d a0, a0, 32 addi.d a1, a1, 64 DCT_MF_F a0, a1, xr4, xr6, xr7 xvpickev.h xr8, xr7, xr6 xvpermi.d xr8, xr8, 0xd8 xvst xr8, a0, 0 .endr .DQ8x8_END: endfunc_x264 function_x264 dequant_8x8_lsx addi.w t1, zero, 6 div.w t0, a2, t1 sub.w t0, t0, t1 mod.w t1, a2, t1 slli.w t1, t1, 8 add.d a1, a1, t1 blt t0, zero, .DQ8x8_DEQUANT_SHR_LSX vreplgr2vr.w vr6, t0 DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 vpickev.h vr11, vr9, vr7 vpickev.h vr12, vr10, vr8 vpermi.w vr13, vr12, 0x0E vpermi.w vr12, vr11, 0x44 vpermi.w vr13, vr11, 0x4E vst vr12, a0, 0 vst vr13, a0, 16 .rept 3 addi.d a0, a0, 32 addi.d a1, a1, 64 DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 vpickev.h vr11, vr9, vr7 vpickev.h vr12, vr10, vr8 vpermi.w vr13, vr12, 0x0E vpermi.w vr12, vr11, 0x44 vpermi.w vr13, vr11, 0x4E vst vr12, a0, 0 vst vr13, a0, 16 .endr b .DQ8x8_END_LSX .DQ8x8_DEQUANT_SHR_LSX: sub.w t4, zero, t0 vreplgr2vr.w vr6, t4 DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 vpickev.h vr11, vr9, vr7 vpickev.h vr12, vr10, vr8 vpermi.w vr13, vr12, 0x0E vpermi.w vr12, vr11, 0x44 vpermi.w vr13, vr11, 0x4E vst vr12, a0, 0 vst vr13, a0, 16 .rept 3 addi.d a0, a0, 32 addi.d a1, a1, 64 DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 vpickev.h vr11, vr9, vr7 vpickev.h vr12, vr10, vr8 vpermi.w vr13, vr12, 0x0E vpermi.w vr12, vr11, 0x44 vpermi.w vr13, vr11, 0x4E vst vr12, a0, 0 vst vr13, a0, 16 .endr .DQ8x8_END_LSX: endfunc_x264 /* * void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp ) */ function_x264 dequant_4x4_dc_lasx addi.w t0, zero, 6 div.w t1, a2, t0 sub.w t1, t1, t0 blt t1, zero, .DQ4x4DC_LT_ZERO // i_qbits >= 0 mod.w t2, a2, t0 slli.w t2, t2, 6 ldx.w t0, a1, t2 sll.w t0, t0, t1 vld vr1, a0, 0 vld vr10, a0, 16 xvreplgr2vr.w xr2, t0 vext2xv.w.h xr3, xr1 xvmul.w xr6, xr3, xr2 vext2xv.w.h xr3, xr10 xvmul.w xr7, xr3, xr2 b .DQ4x4DC_END // i_qbits < 0 
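/*
 * Hedged C sketch of the two dequant paths selected around this label
 * (illustrative of the usual x264 reference behaviour, not a copy of this
 * file): for i_qbits >= 0 the scaled coefficient is shifted left, otherwise
 * it is rounded and shifted right. The "_sketch" name is hypothetical;
 * dctcoef and the dequant_mf layout follow x264's conventions.
 *
 *    static void dequant_4x4_dc_sketch( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
 *    {
 *        int i_qbits = i_qp/6 - 6;
 *        int i_dmf   = dequant_mf[i_qp%6][0];        // DC uses a single scale factor
 *        if( i_qbits >= 0 )
 *            for( int i = 0; i < 16; i++ )
 *                dct[i] = dct[i] * ( i_dmf << i_qbits );
 *        else
 *        {
 *            int f = 1 << ( -i_qbits - 1 );          // rounding offset
 *            for( int i = 0; i < 16; i++ )
 *                dct[i] = ( dct[i] * i_dmf + f ) >> -i_qbits;
 *        }
 *    }
 */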
.DQ4x4DC_LT_ZERO: mod.w t2, a2, t0 slli.w t2, t2, 6 ldx.w t0, a1, t2 sub.w t3, zero, t1 vld vr1, a0, 0 vld vr10, a0, 16 xvreplgr2vr.w xr2, t0 xvreplgr2vr.w xr4, t3 vext2xv.w.h xr5, xr1 xvmul.w xr5, xr5, xr2 xvsrar.w xr6, xr5, xr4 vext2xv.w.h xr5, xr10 xvmul.w xr5, xr5, xr2 xvsrar.w xr7, xr5, xr4 .DQ4x4DC_END: xvpickev.h xr8, xr7, xr6 xvpermi.d xr8, xr8, 0xd8 xvst xr8, a0, 0 endfunc_x264 function_x264 dequant_4x4_dc_lsx addi.w t0, zero, 6 div.w t1, a2, t0 sub.w t1, t1, t0 blt t1, zero, .DQ4x4DC_LT_ZERO_LSX mod.w t2, a2, t0 slli.w t2, t2, 6 ldx.w t0, a1, t2 sll.w t0, t0, t1 vld vr1, a0, 0 vld vr2, a0, 16 vreplgr2vr.w vr3, t0 vexth.w.h vr6, vr1 vsllwil.w.h vr5, vr1, 0 vmul.w vr5, vr5, vr3 vmul.w vr6, vr6, vr3 vexth.w.h vr8, vr2 vsllwil.w.h vr7, vr2, 0 vmul.w vr7, vr7, vr3 vmul.w vr8, vr8, vr3 b .DQ4x4DC_END_LSX .DQ4x4DC_LT_ZERO_LSX: mod.w t2, a2, t0 slli.w t2, t2, 6 ldx.w t0, a1, t2 sub.w t3, zero, t1 vld vr1, a0, 0 vld vr2, a0, 16 vreplgr2vr.w vr3, t0 vreplgr2vr.w vr4, t3 vexth.w.h vr6, vr1 vsllwil.w.h vr5, vr1, 0 vexth.w.h vr8, vr2 vsllwil.w.h vr7, vr2, 0 vmul.w vr5, vr5, vr3 vmul.w vr6, vr6, vr3 vmul.w vr7, vr7, vr3 vmul.w vr8, vr8, vr3 vsrar.w vr5, vr5, vr4 vsrar.w vr6, vr6, vr4 vsrar.w vr7, vr7, vr4 vsrar.w vr8, vr8, vr4 .DQ4x4DC_END_LSX: vpickev.h vr9, vr7, vr5 vpickev.h vr10, vr8, vr6 vpermi.w vr11, vr10, 0x0E vpermi.w vr10, vr9, 0x44 vpermi.w vr11, vr9, 0x4E vst vr10, a0, 0 vst vr11, a0, 16 endfunc_x264 /* * int decimate_score15( dctcoef *dct ) */ function_x264 decimate_score15_lsx addi.w t0, zero, 15 la.local t3, x264_decimate_table4 addi.d t4, a0, 2 vld vr0, t4, 0 vld vr1, t4, 16 vldi vr3, 1 vinsgr2vr.h vr1, zero, 7 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vpermi.w vr2, vr1, 0x0E vpermi.w vr1, vr0, 0x44 vpermi.w vr2, vr0, 0x4E vsle.bu vr4, vr3, vr1 vsle.bu vr5, vr3, vr2 vssrlni.bu.h vr4, vr4, 4 vssrlni.bu.h vr5, vr5, 4 vclz.d vr4, vr4 vclz.d vr5, vr5 vpickve2gr.w t1, vr4, 0 srai.w t1, t1, 2 sub.w t2, t0, t1 addi.w t0, zero, 2 move a0, zero slli.d t2, t2, 1 .LOOP_SCORE_15_LSX: blt t2, zero, .END_SCORE_15_LSX ldx.h t5, t4, t2 addi.d t6, t5, 1 bltu t0, t6, .RET_SCORE_15_1_LSX addi.d t2, t2, -2 move t5, zero .WHILE_SCORE_15_LSX: blt t2, zero, .END_WHILE_15_LSX ldx.h t1, t4, t2 bnez t1, .END_WHILE_15_LSX addi.d t2, t2, -2 addi.d t5, t5, 1 b .WHILE_SCORE_15_LSX .END_WHILE_15_LSX: ldx.b t1, t3, t5 add.d a0, a0, t1 b .LOOP_SCORE_15_LSX .RET_SCORE_15_1_LSX: addi.d a0, zero, 9 jirl $r0, $r1, 0x0 .END_SCORE_15_LSX: endfunc_x264 /* * int decimate_score16( dctcoef *dct ) */ function_x264 decimate_score16_lsx addi.w t0, zero, 15 la.local t3, x264_decimate_table4 addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 vldi vr2, 1 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vpermi.w vr3, vr1, 0x0E vpermi.w vr1, vr0, 0x44 vpermi.w vr3, vr0, 0x4E vsle.bu vr4, vr2, vr1 vsle.bu vr5, vr2, vr3 vssrlni.bu.h vr4, vr4, 4 vssrlni.bu.h vr5, vr5, 4 vclz.d vr4, vr4 vclz.d vr5, vr5 vpickve2gr.w t1, vr4, 0 srai.w t1, t1, 2 sub.w t2, t0, t1 move t4, a0 addi.d t0, zero, 2 move a0, zero slli.d t2, t2, 1 .LOOP_SCORE_16_LSX: blt t2, zero, .END_SCORE_16_LSX ldx.h t5, t4, t2 addi.d t6, t5, 1 bltu t0, t6, .RET_SCORE_16_1_LSX addi.d t2, t2, -2 move t5, zero .WHILE_SCORE_16_LSX: blt t2, zero, .END_WHILE_16_LSX ldx.h t1, t4, t2 bnez t1, .END_WHILE_16_LSX addi.d t2, t2, -2 addi.d t5, t5, 1 b .WHILE_SCORE_16_LSX .END_WHILE_16_LSX: ldx.b t1, t3, t5 add.d a0, a0, t1 b .LOOP_SCORE_16_LSX .RET_SCORE_16_1_LSX: addi.d a0, zero, 9 jirl $r0, $r1, 0x0 .END_SCORE_16_LSX: endfunc_x264 /* * int decimate_score64( dctcoef *dct ) 
*/ function_x264 decimate_score64_lsx addi.w t0, zero, 63 la.local t3, x264_decimate_table8 vxor.v vr20, vr0, vr0 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 vld vr4, a0, 64 vld vr5, a0, 80 vld vr6, a0, 96 vld vr7, a0, 112 vldi vr8, 1 vldi vr9, 0x408 vldi vr10, 0x401 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vssrlni.bu.h vr2, vr2, 0 vssrlni.bu.h vr3, vr3, 0 vssrlni.bu.h vr4, vr4, 0 vssrlni.bu.h vr5, vr5, 0 vssrlni.bu.h vr6, vr6, 0 vssrlni.bu.h vr7, vr7, 0 vpermi.w vr2, vr0, 0x44 vpermi.w vr3, vr1, 0x44 vpermi.w vr6, vr4, 0x44 vpermi.w vr7, vr5, 0x44 vsle.bu vr2, vr8, vr2 vsle.bu vr3, vr8, vr3 vsle.bu vr6, vr8, vr6 vsle.bu vr7, vr8, vr7 vssrlni.bu.h vr2, vr2, 4 vssrlni.bu.h vr3, vr3, 4 vssrlni.bu.h vr6, vr6, 4 vssrlni.bu.h vr7, vr7, 4 vpermi.w vr6, vr2, 0x44 vpermi.w vr7, vr3, 0x44 vpermi.w vr11, vr7, 0x0E vpermi.w vr7, vr6, 0x44 vpermi.w vr7, vr7, 0xD8 vpermi.w vr11, vr6, 0x4E vpermi.w vr11, vr11, 0xD8 vclz.w vr7, vr7 vclz.w vr11, vr11 vssrlni.hu.w vr7, vr7, 2 vssrlni.hu.w vr11, vr11, 2 vpermi.w vr12, vr11, 0x0E vpermi.w vr11, vr7, 0x44 vpermi.w vr12, vr7, 0x4E vsub.h vr11, vr9, vr11 vsub.h vr12, vr9, vr12 vsll.h vr13, vr10, vr11 vsll.h vr14, vr10, vr12 vssrlni.bu.h vr13, vr13, 1 vssrlni.bu.h vr14, vr14, 1 vclz.d vr15, vr14 vpickve2gr.w t1, vr15, 0 sub.w t2, t0, t1 move t4, a0 addi.d t0, zero, 2 slli.d t2, t2, 1 move a0, zero .LOOP_SCORE_64_LSX: blt t2, zero, .END_SCORE_64_LSX ldx.h t5, t4, t2 addi.d t6, t5, 1 bltu t0, t6, .RET_SCORE_64_1_LSX addi.d t2, t2, -2 move t5, zero .WHILE_SCORE_64_LSX: blt t2, zero, .END_WHILE_64_LSX ldx.h t1, t4, t2 bnez t1, .END_WHILE_64_LSX addi.d t2, t2, -2 addi.d t5, t5, 1 b .WHILE_SCORE_64_LSX .END_WHILE_64_LSX: ldx.b t1, t3, t5 add.d a0, a0, t1 b .LOOP_SCORE_64_LSX .RET_SCORE_64_1_LSX: addi.d a0, zero, 9 jirl $r0, $r1, 0x0 .END_SCORE_64_LSX: endfunc_x264 /* * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel ) */ function_x264 coeff_level_run16_lasx addi.w t0, zero, 15 xvld xr0, a0, 0 xvldi xr2, 1 xvssrlni.bu.h xr0, xr0, 0 xvpermi.d xr1, xr0, 0xd8 xvsle.bu xr3, xr2, xr1 xvsrlni.b.h xr3, xr3, 4 xvpickve2gr.du t8, xr3, 0 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit st.w t0, a1, 0x00 // Store runlevel->last addi.d t3, a1, 23 nor t2, zero, zero addi.d t2, t2, -15 and t3, t3, t2 // runlevel->level xor t4, t4, t4 // mask xor t5, t5, t5 // total: number of non-zero elements addi.w t6, zero, 1 // const 1 .LOOP_COEFF_LEVEL_RUN16_LASX: slli.w t7, t0, 1 ldx.h t2, a0, t7 st.h t2, t3, 0 addi.d t3, t3, 2 addi.w t5, t5, 1 sll.w t2, t6, t0 or t4, t4, t2 bge zero, t4, .END_COEFF_LEVEL_RUN16_LASX addi.w t0, t0, -1 slli.w t1, t1, 2 addi.w t1, t1, 4 sll.d t8, t8, t1 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LASX .END_COEFF_LEVEL_RUN16_LASX: st.w t4, a1, 4 move a0, t5 endfunc_x264 function_x264 coeff_level_run15_lasx addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 xvldi xr3, 1 vinsgr2vr.h vr1, zero, 7 xvpermi.q xr1, xr0, 0x20 xvssrlni.bu.h xr1, xr1, 0 xvpermi.d xr2, xr1, 0xd8 xvsle.bu xr4, xr3, xr2 xvsrlni.b.h xr4, xr4, 4 xvpickve2gr.du t8, xr4, 0 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit st.w t0, a1, 0x00 // Store runlevel->last addi.d t3, a1, 23 nor t2, zero, zero addi.d t2, t2, -15 and t3, t3, t2 // runlevel->level xor t4, t4, t4 // mask xor t5, t5, t5 // total: number of 
non-zero elements addi.w t6, zero, 1 // const 1 .LOOP_COEFF_LEVEL_RUN15_LASX: slli.w t7, t0, 1 ldx.h t2, a0, t7 st.h t2, t3, 0 addi.d t3, t3, 2 addi.w t5, t5, 1 sll.w t2, t6, t0 or t4, t4, t2 bge zero, t4, .END_COEFF_LEVEL_RUN15_LASX addi.w t0, t0, -1 slli.w t1, t1, 2 addi.w t1, t1, 4 sll.d t8, t8, t1 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LASX .END_COEFF_LEVEL_RUN15_LASX: st.w t4, a1, 4 move a0, t5 endfunc_x264 function_x264 coeff_level_run16_lsx addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 vldi vr2, 1 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vpermi.w vr1, vr0, 0x44 vsle.bu vr3, vr2, vr1 vsrlni.b.h vr3, vr3, 4 vpickve2gr.du t8, vr3, 0 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit st.w t0, a1, 0x00 // Store runlevel->last addi.d t3, a1, 23 nor t2, zero, zero addi.d t2, t2, -15 and t3, t3, t2 // runlevel->level xor t4, t4, t4 // mask xor t5, t5, t5 // total: number of non-zero elements addi.w t6, zero, 1 // const 1 .LOOP_COEFF_LEVEL_RUN16_LSX: slli.w t7, t0, 1 ldx.h t2, a0, t7 st.h t2, t3, 0 addi.d t3, t3, 2 addi.w t5, t5, 1 sll.w t2, t6, t0 or t4, t4, t2 bge zero, t4, .END_COEFF_LEVEL_RUN16_LSX addi.w t0, t0, -1 slli.w t1, t1, 2 addi.w t1, t1, 4 sll.d t8, t8, t1 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LSX .END_COEFF_LEVEL_RUN16_LSX: st.w t4, a1, 4 move a0, t5 endfunc_x264 function_x264 coeff_level_run15_lsx addi.w t0, zero, 15 vld vr0, a0, 0 vld vr1, a0, 16 vldi vr2, 1 vinsgr2vr.h vr1, zero, 7 vssrlni.bu.h vr0, vr0, 0 vssrlni.bu.h vr1, vr1, 0 vpermi.w vr1, vr0, 0x44 vsle.bu vr3, vr2, vr1 vsrlni.b.h vr3, vr3, 4 vpickve2gr.du t8, vr3, 0 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit st.w t0, a1, 0x00 // Store runlevel->last addi.d t3, a1, 23 nor t2, zero, zero addi.d t2, t2, -15 and t3, t3, t2 // runlevel->level xor t4, t4, t4 // mask xor t5, t5, t5 // total: number of non-zero elements addi.w t6, zero, 1 // const 1 .LOOP_COEFF_LEVEL_RUN15_LSX: slli.w t7, t0, 1 ldx.h t2, a0, t7 st.h t2, t3, 0 addi.d t3, t3, 2 addi.w t5, t5, 1 sll.w t2, t6, t0 or t4, t4, t2 bge zero, t4, .END_COEFF_LEVEL_RUN15_LSX addi.w t0, t0, -1 slli.w t1, t1, 2 addi.w t1, t1, 4 sll.d t8, t8, t1 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LSX .END_COEFF_LEVEL_RUN15_LSX: st.w t4, a1, 4 move a0, t5 endfunc_x264 function_x264 coeff_level_run8_lsx addi.w t0, zero, 15 vld vr0, a0, 0 vxor.v vr1, vr1, vr1 vldi vr2, 1 vssrlni.bu.h vr0, vr0, 0 vpermi.w vr1, vr0, 0x44 vsle.bu vr3, vr2, vr1 vsrlni.b.h vr3, vr3, 4 vpickve2gr.du t8, vr3, 0 clz.d t1, t8 srai.w t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit st.w t0, a1, 0x00 // Store runlevel->last addi.d t3, a1, 23 nor t2, zero, zero addi.d t2, t2, -15 and t3, t3, t2 // runlevel->level xor t4, t4, t4 // mask xor t5, t5, t5 // total: number of non-zero elements addi.w t6, zero, 1 // const 1 .LOOP_COEFF_LEVEL_RUN8_LSX: slli.w t7, t0, 1 ldx.h t2, a0, t7 st.h t2, t3, 0 addi.d t3, t3, 2 addi.w t5, t5, 1 sll.w t2, t6, t0 or t4, t4, t2 bge zero, t4, .END_COEFF_LEVEL_RUN8_LSX addi.w t0, t0, -1 slli.w t1, t1, 2 addi.w t1, t1, 4 sll.d t8, t8, t1 clz.d t1, t8 srai.w 
t1, t1, 2 sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit bge t0, zero, .LOOP_COEFF_LEVEL_RUN8_LSX .END_COEFF_LEVEL_RUN8_LSX: st.w t4, a1, 4 move a0, t5 endfunc_x264 x264-master/common/loongarch/quant.h000066400000000000000000000117001502133446700176230ustar00rootroot00000000000000/***************************************************************************** * quant.h: loongarch quantization and level-run ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Shiyou Yin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_LOONGARCH_QUANT_H #define X264_LOONGARCH_QUANT_H #define x264_coeff_last64_lsx x264_template(coeff_last64_lsx) int32_t x264_coeff_last64_lsx( int16_t *p_src ); #define x264_coeff_last16_lsx x264_template(coeff_last16_lsx) int32_t x264_coeff_last16_lsx( int16_t *p_src ); #define x264_coeff_last15_lsx x264_template(coeff_last15_lsx) int32_t x264_coeff_last15_lsx( int16_t *p_src ); #define x264_coeff_last8_lsx x264_template(coeff_last8_lsx) int32_t x264_coeff_last8_lsx( int16_t *p_src ); #define x264_coeff_last4_lsx x264_template(coeff_last4_lsx) int32_t x264_coeff_last4_lsx( int16_t *p_src ); #define x264_quant_4x4_lsx x264_template(quant_4x4_lsx) int32_t x264_quant_4x4_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ); #define x264_quant_4x4x4_lsx x264_template(quant_4x4x4_lsx) int32_t x264_quant_4x4x4_lsx( int16_t p_dct[4][16], uint16_t pu_mf[16], uint16_t pu_bias[16] ); #define x264_quant_8x8_lsx x264_template(quant_8x8_lsx) int32_t x264_quant_8x8_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ); #define x264_quant_4x4_dc_lsx x264_template(quant_4x4_dc_lsx) int32_t x264_quant_4x4_dc_lsx( dctcoef dct[16], int32_t mf, int32_t bias ); #define x264_quant_2x2_dc_lsx x264_template(quant_2x2_dc_lsx) int32_t x264_quant_2x2_dc_lsx( dctcoef dct[4], int32_t mf, int32_t bias ); #define x264_dequant_4x4_lsx x264_template(dequant_4x4_lsx) void x264_dequant_4x4_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_lsx x264_template(dequant_8x8_lsx) void x264_dequant_8x8_lsx( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_dc_lsx x264_template(dequant_4x4_dc_lsx) void x264_dequant_4x4_dc_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_decimate_score15_lsx x264_template(decimate_score15_lsx) int x264_decimate_score15_lsx( dctcoef *dct ); #define x264_decimate_score16_lsx x264_template(decimate_score16_lsx) int x264_decimate_score16_lsx( dctcoef *dct ); #define x264_decimate_score64_lsx x264_template(decimate_score64_lsx) int 
x264_decimate_score64_lsx( dctcoef *dct ); #define x264_coeff_last64_lasx x264_template(coeff_last64_lasx) int32_t x264_coeff_last64_lasx( int16_t *p_src ); #define x264_coeff_last16_lasx x264_template(coeff_last16_lasx) int32_t x264_coeff_last16_lasx( int16_t *p_src ); #define x264_coeff_last15_lasx x264_template(coeff_last15_lasx) int32_t x264_coeff_last15_lasx( int16_t *p_src ); #define x264_quant_4x4x4_lasx x264_template(quant_4x4x4_lasx) int32_t x264_quant_4x4x4_lasx( int16_t p_dct[4][16], uint16_t pu_mf[16], uint16_t pu_bias[16] ); #define x264_dequant_4x4_lasx x264_template(dequant_4x4_lasx) void x264_dequant_4x4_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_lasx x264_template(dequant_8x8_lasx) void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx) void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx) int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx) int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx) int x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx) int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx) int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * ); #endif/* X264_LOONGARCH_QUANT_H */ x264-master/common/loongarch/sad-a.S000066400000000000000000003003161502133446700174370ustar00rootroot00000000000000/***************************************************************************** * sad-a.S: loongarch sad functions ***************************************************************************** * Copyright (C) 2023-2025 x264 project * * Authors: Lu Wang * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "loongson_asm.S" #include "loongson_util.S" #if !HIGH_BIT_DEPTH /* void x264_pixel_sad_x4_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_16x16_lasx slli.d t1, a5, 1 add.d t2, a5, t1 slli.d t3, a5, 2 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 0 xvld xr16, a0, 32 vld vr4, a1, 0 vldx vr8, a1, a5 vld vr5, a2, 0 vldx vr9, a2, a5 vld vr6, a3, 0 vldx vr10, a3, a5 vld vr7, a4, 0 vldx vr11, a4, a5 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr12, xr8, xr8 xvhaddw.hu.bu xr13, xr9, xr9 xvhaddw.hu.bu xr14, xr10, xr10 xvhaddw.hu.bu xr15, xr11, xr11 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 vldx vr4, a1, t1 vldx vr8, a1, t2 vldx vr5, a2, t1 vldx vr9, a2, t2 vldx vr6, a3, t1 vldx vr10, a3, t2 vldx vr7, a4, t1 vldx vr11, a4, t2 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr16, xr4 xvabsd.bu xr9, xr16, xr5 xvabsd.bu xr10, xr16, xr6 xvabsd.bu xr11, xr16, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 add.d a4, a4, t3 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 64 xvld xr16, a0, 96 vld vr4, a1, 0 vldx vr8, a1, a5 vld vr5, a2, 0 vldx vr9, a2, a5 vld vr6, a3, 0 vldx vr10, a3, a5 vld vr7, a4, 0 vldx vr11, a4, a5 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 vldx vr4, a1, t1 vldx vr8, a1, t2 vldx vr5, a2, t1 vldx vr9, a2, t2 vldx vr6, a3, t1 vldx vr10, a3, t2 vldx vr7, a4, t1 vldx vr11, a4, t2 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr16, xr4 xvabsd.bu xr9, xr16, xr5 xvabsd.bu xr10, xr16, xr6 xvabsd.bu xr11, xr16, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 add.d a4, a4, t3 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 128 xvld xr16, a0, 160 vld vr4, a1, 0 vldx vr8, a1, a5 vld vr5, a2, 0 vldx vr9, a2, a5 vld vr6, a3, 0 vldx vr10, a3, a5 vld vr7, a4, 0 vldx vr11, a4, a5 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference 
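/*
 * Hedged C sketch of the contract the sad_x4 kernels in this file implement
 * (an illustration, not the exact x264 reference code): one encode-order
 * source block, whose rows are FENC_STRIDE apart, is compared against four
 * reference blocks sharing a single stride, and the four sums of absolute
 * differences are written to p_sad_array. The "_sketch" name and the w/h
 * parameters are hypothetical generalisations of the fixed-size kernels.
 *
 *    static void sad_x4_sketch( uint8_t *src, uint8_t *ref0, uint8_t *ref1,
 *                               uint8_t *ref2, uint8_t *ref3,
 *                               intptr_t ref_stride, int32_t sad[4],
 *                               int w, int h )
 *    {
 *        uint8_t *ref[4] = { ref0, ref1, ref2, ref3 };
 *        for( int k = 0; k < 4; k++ )
 *        {
 *            int32_t sum = 0;
 *            for( int y = 0; y < h; y++ )
 *                for( int x = 0; x < w; x++ )
 *                {
 *                    int d = src[x + y * FENC_STRIDE] - ref[k][x + y * ref_stride];
 *                    sum += d < 0 ? -d : d;        // absolute difference
 *                }
 *            sad[k] = sum;                         // one score per candidate reference
 *        }
 *    }
 */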
xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 vldx vr4, a1, t1 vldx vr8, a1, t2 vldx vr5, a2, t1 vldx vr9, a2, t2 vldx vr6, a3, t1 vldx vr10, a3, t2 vldx vr7, a4, t1 vldx vr11, a4, t2 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr16, xr4 xvabsd.bu xr9, xr16, xr5 xvabsd.bu xr10, xr16, xr6 xvabsd.bu xr11, xr16, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 add.d a4, a4, t3 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 192 xvld xr16, a0, 224 vld vr4, a1, 0 vldx vr8, a1, a5 vld vr5, a2, 0 vldx vr9, a2, a5 vld vr6, a3, 0 vldx vr10, a3, a5 vld vr7, a4, 0 vldx vr11, a4, a5 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 vldx vr4, a1, t1 vldx vr8, a1, t2 vldx vr5, a2, t1 vldx vr9, a2, t2 vldx vr6, a3, t1 vldx vr10, a3, t2 vldx vr7, a4, t1 vldx vr11, a4, t2 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr16, xr4 xvabsd.bu xr9, xr16, xr5 xvabsd.bu xr10, xr16, xr6 xvabsd.bu xr11, xr16, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 xvori.b xr17, xr12, 0 xvori.b xr18, xr13, 0 xvpermi.q xr12, xr14, 0x02 xvpermi.q xr14, xr17, 0x31 xvpermi.q xr13, xr15, 0x02 xvpermi.q xr15, xr18, 0x31 xvadd.h xr12, xr12, xr14 xvadd.h xr13, xr13, xr15 xvhaddw.w.h xr12, xr12, xr12 xvhaddw.w.h xr13, xr13, xr13 xvhaddw.d.w xr12, xr12, xr12 xvhaddw.d.w xr13, xr13, xr13 xvhaddw.q.d xr12, xr12, xr12 xvhaddw.q.d xr13, xr13, xr13 xvpackev.w xr13, xr13, xr12 // Store data to p_sad_array xvstelm.d xr13, a6, 0, 0 xvstelm.d xr13, a6, 8, 2 endfunc_x264 /* void x264_pixel_sad_x4_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_16x8_lasx slli.d t1, a5, 1 add.d t2, a5, t1 slli.d t3, a5, 2 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 0 vld vr4, a1, 0 vldx vr8, a1, a5 vld vr5, a2, 0 vldx vr9, a2, a5 vld vr6, a3, 0 vldx vr10, a3, a5 vld vr7, a4, 0 vldx vr11, a4, a5 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 
xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr12, xr8, xr8 xvhaddw.hu.bu xr13, xr9, xr9 xvhaddw.hu.bu xr14, xr10, xr10 xvhaddw.hu.bu xr15, xr11, xr11 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 32 vldx vr4, a1, t1 vldx vr8, a1, t2 vldx vr5, a2, t1 vldx vr9, a2, t2 vldx vr6, a3, t1 vldx vr10, a3, t2 vldx vr7, a4, t1 vldx vr11, a4, t2 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 add.d a4, a4, t3 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 64 vld vr4, a1, 0 vldx vr8, a1, a5 vld vr5, a2, 0 vldx vr9, a2, a5 vld vr6, a3, 0 vldx vr10, a3, a5 vld vr7, a4, 0 vldx vr11, a4, a5 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 xvld xr3, a0, 96 vldx vr4, a1, t1 vldx vr8, a1, t2 vldx vr5, a2, t1 vldx vr9, a2, t2 vldx vr6, a3, t1 vldx vr10, a3, t2 vldx vr7, a4, t1 vldx vr11, a4, t2 xvpermi.q xr4, xr8, 0x02 xvpermi.q xr5, xr9, 0x02 xvpermi.q xr6, xr10, 0x02 xvpermi.q xr7, xr11, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvadd.h xr12, xr12, xr8 xvadd.h xr13, xr13, xr9 xvadd.h xr14, xr14, xr10 xvadd.h xr15, xr15, xr11 xvori.b xr17, xr12, 0 xvori.b xr18, xr13, 0 xvpermi.q xr12, xr14, 0x02 xvpermi.q xr14, xr17, 0x31 xvpermi.q xr13, xr15, 0x02 xvpermi.q xr15, xr18, 0x31 xvadd.h xr12, xr12, xr14 xvadd.h xr13, xr13, xr15 xvhaddw.w.h xr12, xr12, xr12 xvhaddw.w.h xr13, xr13, xr13 xvhaddw.d.w xr12, xr12, xr12 xvhaddw.d.w xr13, xr13, xr13 xvhaddw.q.d xr12, xr12, xr12 xvhaddw.q.d xr13, xr13, xr13 xvpackev.w xr13, xr13, xr12 // Store data to p_sad_array xvstelm.d xr13, a6, 0, 0 xvstelm.d xr13, a6, 8, 2 endfunc_x264 /* void x264_pixel_sad_x4_8x8_lasx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x8_lasx slli.d t1, a5, 1 add.d t2, t1, a5 slli.d t3, a5, 2 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 vilvl.d vr4, vr5, vr4 vilvl.d vr6, vr7, vr6 vilvl.d vr8, vr9, vr8 vilvl.d vr10, vr11, vr10 vilvl.d vr14, vr15, vr14 vilvl.d vr16, vr17, vr16 vilvl.d vr18, vr19, vr18 vilvl.d vr20, vr21, vr20 xvpermi.q xr4, xr6, 0x02 xvpermi.q xr8, xr10, 0x02 xvpermi.q xr14, 
xr16, 0x02 xvpermi.q xr18, xr20, 0x02 // Calculate the absolute value of the difference xvldrepl.d xr3, a0, 0 xvabsd.bu xr5, xr3, xr4 xvldrepl.d xr3, a0, 16 xvabsd.bu xr9, xr3, xr8 xvldrepl.d xr3, a0, 32 xvabsd.bu xr10, xr3, xr14 xvldrepl.d xr3, a0, 48 xvabsd.bu xr11, xr3, xr18 xvaddwev.h.bu xr0, xr5, xr9 xvaddwod.h.bu xr1, xr5, xr9 xvaddwev.h.bu xr2, xr10, xr11 xvaddwod.h.bu xr22, xr10, xr11 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 add.d a4, a4, t3 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 vilvl.d vr4, vr5, vr4 vilvl.d vr6, vr7, vr6 vilvl.d vr8, vr9, vr8 vilvl.d vr10, vr11, vr10 vilvl.d vr14, vr15, vr14 vilvl.d vr16, vr17, vr16 vilvl.d vr18, vr19, vr18 vilvl.d vr20, vr21, vr20 xvpermi.q xr4, xr6, 0x02 xvpermi.q xr8, xr10, 0x02 xvpermi.q xr14, xr16, 0x02 xvpermi.q xr18, xr20, 0x02 // Calculate the absolute value of the difference xvldrepl.d xr3, a0, 64 xvabsd.bu xr5, xr3, xr4 xvldrepl.d xr3, a0, 80 xvabsd.bu xr9, xr3, xr8 xvldrepl.d xr3, a0, 96 xvabsd.bu xr10, xr3, xr14 xvldrepl.d xr3, a0, 112 xvabsd.bu xr11, xr3, xr18 xvaddwev.h.bu xr12, xr5, xr9 xvaddwod.h.bu xr13, xr5, xr9 xvaddwev.h.bu xr14, xr10, xr11 xvaddwod.h.bu xr15, xr10, xr11 xvadd.h xr5, xr0, xr12 xvadd.h xr9, xr1, xr13 xvadd.h xr10, xr2, xr14 xvadd.h xr11, xr22, xr15 xvadd.h xr5, xr5, xr9 xvadd.h xr10, xr10, xr11 xvadd.h xr10, xr10, xr5 xvhaddw.wu.hu xr10, xr10, xr10 xvhaddw.du.wu xr10, xr10, xr10 xvpermi.q xr5, xr10, 0x01 xvpickev.w xr10, xr5, xr10 // Store data to p_sad_array vst vr10, a6, 0 endfunc_x264 /* void x264_pixel_sad_x4_8x4_lasx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x4_lasx slli.d t1, a5, 1 add.d t2, t1, a5 // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 fld.d f2, a0, 0 fld.d f3, a0, 16 fld.d f12, a0, 32 fld.d f13, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 vilvl.d vr3, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr7, vr11, vr7 vilvl.d vr13, vr13, vr12 vilvl.d vr14, vr18, vr14 vilvl.d vr15, vr19, vr15 vilvl.d vr16, vr20, vr16 vilvl.d vr17, vr21, vr17 xvpermi.q xr3, xr13, 0x02 xvpermi.q xr4, xr16, 0x02 xvpermi.q xr5, xr17, 0x02 xvpermi.q xr6, xr14, 0x02 xvpermi.q xr7, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr8, xr3, xr4 xvabsd.bu xr9, xr3, xr5 xvabsd.bu xr10, xr3, xr6 xvabsd.bu xr11, xr3, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvpermi.d xr10, xr10, 0x4e xvpermi.d xr11, xr11, 0x4e xvadd.h xr8, xr8, xr10 xvadd.h xr9, xr9, xr11 xvhaddw.w.h xr8, xr8, xr8 xvhaddw.w.h xr9, xr9, xr9 xvhaddw.d.w xr8, xr8, xr8 xvhaddw.d.w xr9, xr9, xr9 xvhaddw.q.d xr8, xr8, xr8 xvhaddw.q.d xr9, xr9, xr9 xvpackev.w xr9, xr9, xr8 // Store data to p_sad_array xvstelm.d xr9, a6, 0, 0 xvstelm.d xr9, a6, 8, 2 endfunc_x264 /* void x264_pixel_sad_x4_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_4x4_lsx slli.d t0, a5, 1 add.d t1, a5, t0 slli.d t2, a5, 2 // Load data from p_src, 
p_ref0, p_ref1, p_ref2 and p_ref3 fld.s f2, a0, 0 fld.s f3, a0, 16 fld.s f4, a1, 0 fldx.s f8, a1, a5 fld.s f5, a2, 0 fldx.s f9, a2, a5 fld.s f6, a3, 0 fldx.s f10, a3, a5 fld.s f7, a4, 0 fldx.s f11, a4, a5 vilvl.w vr3, vr3, vr2 vilvl.w vr4, vr8, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr7, vr11, vr7 fld.s f2, a0, 32 fld.s f0, a0, 48 fldx.s f8, a1, t0 fldx.s f12, a1, t1 fldx.s f9, a2, t0 fldx.s f13, a2, t1 fldx.s f10, a3, t0 fldx.s f14, a3, t1 fldx.s f11, a4, t0 fldx.s f15, a4, t1 vilvl.w vr2, vr0, vr2 vilvl.w vr8, vr12, vr8 vilvl.w vr9, vr13, vr9 vilvl.w vr10, vr14, vr10 vilvl.w vr11, vr15, vr11 vilvl.d vr3, vr2, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr7, vr11, vr7 // Calculate the absolute value of the difference vabsd.bu vr8, vr3, vr4 vabsd.bu vr9, vr3, vr5 vabsd.bu vr10, vr3, vr6 vabsd.bu vr11, vr3, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.wu.hu vr10, vr10, vr10 vhaddw.wu.hu vr11, vr11, vr11 vhaddw.du.wu vr8, vr8, vr8 vhaddw.du.wu vr9, vr9, vr9 vhaddw.du.wu vr10, vr10, vr10 vhaddw.du.wu vr11, vr11, vr11 vhaddw.qu.du vr8, vr8, vr8 vhaddw.qu.du vr9, vr9, vr9 vhaddw.qu.du vr10, vr10, vr10 vhaddw.qu.du vr11, vr11, vr11 // Store data to p_sad_array vstelm.w vr8, a6, 0, 0 vstelm.w vr9, a6, 4, 0 vstelm.w vr10, a6, 8, 0 vstelm.w vr11, a6, 12, 0 endfunc_x264 /* void x264_pixel_sad_x3_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_16x16_lasx // Load data from p_src, p_ref0, p_ref1 and p_ref2 slli.d t1, a4, 1 add.d t2, a4, t1 slli.d t3, a4, 2 xvld xr2, a0, 0 xvld xr3, a0, 32 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 xvpermi.q xr4, xr7, 0x02 xvpermi.q xr5, xr8, 0x02 xvpermi.q xr6, xr9, 0x02 xvpermi.q xr10, xr13, 0x02 xvpermi.q xr11, xr14, 0x02 xvpermi.q xr12, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr7, xr2, xr4 xvabsd.bu xr8, xr2, xr5 xvabsd.bu xr9, xr2, xr6 xvabsd.bu xr10, xr3, xr10 xvabsd.bu xr11, xr3, xr11 xvabsd.bu xr12, xr3, xr12 xvhaddw.hu.bu xr16, xr7, xr7 xvhaddw.hu.bu xr17, xr8, xr8 xvhaddw.hu.bu xr18, xr9, xr9 xvhaddw.hu.bu xr19, xr10, xr10 xvhaddw.hu.bu xr20, xr11, xr11 xvhaddw.hu.bu xr21, xr12, xr12 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 xvld xr2, a0, 64 xvld xr3, a0, 96 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 xvpermi.q xr4, xr7, 0x02 xvpermi.q xr5, xr8, 0x02 xvpermi.q xr6, xr9, 0x02 xvpermi.q xr10, xr13, 0x02 xvpermi.q xr11, xr14, 0x02 xvpermi.q xr12, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr7, xr2, xr4 xvabsd.bu xr8, xr2, xr5 xvabsd.bu xr9, xr2, xr6 xvabsd.bu xr10, xr3, xr10 xvabsd.bu xr11, xr3, xr11 xvabsd.bu xr12, xr3, xr12 xvhaddw.hu.bu xr7, xr7, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvhaddw.hu.bu xr12, xr12, xr12 xvadd.h xr16, xr16, xr7 xvadd.h xr17, xr17, xr8 xvadd.h xr18, xr18, xr9 xvadd.h xr19, xr19, xr10 xvadd.h xr20, xr20, xr11 xvadd.h xr21, xr21, xr12 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 xvld xr2, a0, 128 xvld xr3, a0, 160 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, 
t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 xvpermi.q xr4, xr7, 0x02 xvpermi.q xr5, xr8, 0x02 xvpermi.q xr6, xr9, 0x02 xvpermi.q xr10, xr13, 0x02 xvpermi.q xr11, xr14, 0x02 xvpermi.q xr12, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr7, xr2, xr4 xvabsd.bu xr8, xr2, xr5 xvabsd.bu xr9, xr2, xr6 xvabsd.bu xr10, xr3, xr10 xvabsd.bu xr11, xr3, xr11 xvabsd.bu xr12, xr3, xr12 xvhaddw.hu.bu xr7, xr7, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvhaddw.hu.bu xr12, xr12, xr12 xvadd.h xr16, xr16, xr7 xvadd.h xr17, xr17, xr8 xvadd.h xr18, xr18, xr9 xvadd.h xr19, xr19, xr10 xvadd.h xr20, xr20, xr11 xvadd.h xr21, xr21, xr12 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 xvld xr2, a0, 192 xvld xr3, a0, 224 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 xvpermi.q xr4, xr7, 0x02 xvpermi.q xr5, xr8, 0x02 xvpermi.q xr6, xr9, 0x02 xvpermi.q xr10, xr13, 0x02 xvpermi.q xr11, xr14, 0x02 xvpermi.q xr12, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr7, xr2, xr4 xvabsd.bu xr8, xr2, xr5 xvabsd.bu xr9, xr2, xr6 xvabsd.bu xr10, xr3, xr10 xvabsd.bu xr11, xr3, xr11 xvabsd.bu xr12, xr3, xr12 xvhaddw.hu.bu xr7, xr7, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvhaddw.hu.bu xr12, xr12, xr12 xvadd.h xr16, xr16, xr7 xvadd.h xr17, xr17, xr8 xvadd.h xr18, xr18, xr9 xvadd.h xr19, xr19, xr10 xvadd.h xr20, xr20, xr11 xvadd.h xr21, xr21, xr12 xvadd.h xr11, xr16, xr19 xvadd.h xr12, xr17, xr20 xvadd.h xr13, xr18, xr21 xvhaddw.wu.hu xr11, xr11, xr11 xvhaddw.wu.hu xr12, xr12, xr12 xvhaddw.wu.hu xr13, xr13, xr13 xvhaddw.du.wu xr11, xr11, xr11 xvhaddw.du.wu xr12, xr12, xr12 xvhaddw.du.wu xr13, xr13, xr13 xvhaddw.qu.du xr11, xr11, xr11 xvhaddw.qu.du xr12, xr12, xr12 xvhaddw.qu.du xr13, xr13, xr13 xvpickve.w xr17, xr11, 4 xvpickve.w xr18, xr12, 4 xvpickve.w xr19, xr13, 4 xvadd.w xr11, xr11, xr17 xvadd.w xr12, xr12, xr18 xvadd.w xr13, xr13, xr19 // Store data to p_sad_array vstelm.w vr11, a5, 0, 0 vstelm.w vr12, a5, 4, 0 vstelm.w vr13, a5, 8, 0 endfunc_x264 /* void x264_pixel_sad_x3_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_16x8_lasx // Load data from p_src, p_ref0, p_ref1 and p_ref2 slli.d t1, a4, 1 add.d t2, a4, t1 slli.d t3, a4, 2 xvld xr2, a0, 0 xvld xr3, a0, 32 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 xvpermi.q xr4, xr7, 0x02 xvpermi.q xr5, xr8, 0x02 xvpermi.q xr6, xr9, 0x02 xvpermi.q xr10, xr13, 0x02 xvpermi.q xr11, xr14, 0x02 xvpermi.q xr12, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr7, xr2, xr4 xvabsd.bu xr8, xr2, xr5 xvabsd.bu xr9, xr2, xr6 xvabsd.bu xr10, xr3, xr10 xvabsd.bu xr11, xr3, xr11 xvabsd.bu xr12, xr3, xr12 xvhaddw.hu.bu xr16, xr7, xr7 xvhaddw.hu.bu xr17, xr8, xr8 xvhaddw.hu.bu xr18, xr9, xr9 xvhaddw.hu.bu xr19, xr10, xr10 xvhaddw.hu.bu xr20, xr11, xr11 xvhaddw.hu.bu xr21, xr12, xr12 add.d a1, a1, t3 add.d a2, a2, t3 add.d a3, a3, t3 xvld xr2, a0, 64 xvld xr3, a0, 96 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 xvpermi.q xr4, 
xr7, 0x02 xvpermi.q xr5, xr8, 0x02 xvpermi.q xr6, xr9, 0x02 xvpermi.q xr10, xr13, 0x02 xvpermi.q xr11, xr14, 0x02 xvpermi.q xr12, xr15, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr7, xr2, xr4 xvabsd.bu xr8, xr2, xr5 xvabsd.bu xr9, xr2, xr6 xvabsd.bu xr10, xr3, xr10 xvabsd.bu xr11, xr3, xr11 xvabsd.bu xr12, xr3, xr12 xvhaddw.hu.bu xr7, xr7, xr7 xvhaddw.hu.bu xr8, xr8, xr8 xvhaddw.hu.bu xr9, xr9, xr9 xvhaddw.hu.bu xr10, xr10, xr10 xvhaddw.hu.bu xr11, xr11, xr11 xvhaddw.hu.bu xr12, xr12, xr12 xvadd.h xr16, xr16, xr7 xvadd.h xr17, xr17, xr8 xvadd.h xr18, xr18, xr9 xvadd.h xr19, xr19, xr10 xvadd.h xr20, xr20, xr11 xvadd.h xr21, xr21, xr12 xvadd.h xr11, xr16, xr19 xvadd.h xr12, xr17, xr20 xvadd.h xr13, xr18, xr21 xvhaddw.wu.hu xr11, xr11, xr11 xvhaddw.wu.hu xr12, xr12, xr12 xvhaddw.wu.hu xr13, xr13, xr13 xvhaddw.du.wu xr11, xr11, xr11 xvhaddw.du.wu xr12, xr12, xr12 xvhaddw.du.wu xr13, xr13, xr13 xvhaddw.qu.du xr11, xr11, xr11 xvhaddw.qu.du xr12, xr12, xr12 xvhaddw.qu.du xr13, xr13, xr13 xvpickve.w xr17, xr11, 4 xvpickve.w xr18, xr12, 4 xvpickve.w xr19, xr13, 4 xvadd.w xr11, xr11, xr17 xvadd.w xr12, xr12, xr18 xvadd.w xr13, xr13, xr19 // Store data to p_sad_array vstelm.w vr11, a5, 0, 0 vstelm.w vr12, a5, 4, 0 vstelm.w vr13, a5, 8, 0 endfunc_x264 /* void x264_pixel_sad_x3_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_4x4_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.s f3, a0, 0 fld.s f7, a0, 16 fld.s f11, a0, 32 fld.s f15, a0, 48 FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.w vr3, vr7, vr3 vilvl.w vr4, vr8, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr11, vr15, vr11 vilvl.w vr12, vr16, vr12 vilvl.w vr13, vr17, vr13 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr11, vr3 vilvl.d vr4, vr12, vr4 vilvl.d vr5, vr13, vr5 vilvl.d vr6, vr14, vr6 // Calculate the absolute value of the difference vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.du.wu vr7, vr7, vr7 vhaddw.du.wu vr8, vr8, vr8 vhaddw.du.wu vr9, vr9, vr9 vhaddw.qu.du vr7, vr7, vr7 vhaddw.qu.du vr8, vr8, vr8 vhaddw.qu.du vr9, vr9, vr9 // Store data to p_sad_array vstelm.w vr7, a5, 0, 0 vstelm.w vr8, a5, 4, 0 vstelm.w vr9, a5, 8, 0 endfunc_x264 /* int32_t x264_pixel_sad_8x4_lasx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_8x4_lasx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 // Load data from p_src and p_ref FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.d vr3, vr5, vr3 vilvl.d vr4, vr6, vr4 vilvl.d vr7, vr9, vr7 vilvl.d vr8, vr10, vr8 xvpermi.q xr3, xr7, 0x02 xvpermi.q xr4, xr8, 0x02 // Calculate the absolute value of the difference xvabsd.bu xr5, xr3, xr4 xvhaddw.hu.bu xr6, xr5, xr5 xvhaddw.wu.hu xr6, xr6, xr6 xvhaddw.du.wu xr6, xr6, xr6 xvhaddw.qu.du xr6, xr6, xr6 xvpickve2gr.wu t2, xr6, 0 xvpickve2gr.wu t3, xr6, 4 add.d a0, t2, t3 endfunc_x264 /* int32_t x264_pixel_sad_4x4_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_4x4_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 
add.d t4, a3, t2 // Load data from p_src and p_ref FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.w vr3, vr5, vr3 vilvl.w vr4, vr6, vr4 vilvl.w vr7, vr9, vr7 vilvl.w vr8, vr10, vr8 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 // Calculate the absolute value of the difference vabsd.bu vr5, vr3, vr4 vhaddw.hu.bu vr6, vr5, vr5 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.du.wu vr6, vr6, vr6 vhaddw.qu.du vr6, vr6, vr6 vpickve2gr.wu a0, vr6, 0 endfunc_x264 /* int32_t x264_pixel_sad_4x8_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_4x8_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 // Load data from p_src and p_ref FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.w vr3, vr5, vr3 vilvl.w vr4, vr6, vr4 vilvl.w vr7, vr9, vr7 vilvl.w vr8, vr10, vr8 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vabsd.bu vr11, vr3, vr4 vhaddw.hu.bu vr11, vr11, vr11 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.w vr3, vr5, vr3 vilvl.w vr4, vr6, vr4 vilvl.w vr7, vr9, vr7 vilvl.w vr8, vr10, vr8 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vabsd.bu vr5, vr3, vr4 vhaddw.hu.bu vr5, vr5, vr5 vadd.h vr6, vr11, vr5 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.du.wu vr6, vr6, vr6 vhaddw.qu.du vr6, vr6, vr6 vpickve2gr.wu a0, vr6, 0 endfunc_x264 /* int32_t x264_pixel_sad_4x16_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_4x16_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 // Load data from p_src and p_ref FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.w vr3, vr5, vr3 vilvl.w vr4, vr6, vr4 vilvl.w vr7, vr9, vr7 vilvl.w vr8, vr10, vr8 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vabsd.bu vr11, vr3, vr4 vhaddw.hu.bu vr11, vr11, vr11 .rept 3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.w vr3, vr5, vr3 vilvl.w vr4, vr6, vr4 vilvl.w vr7, vr9, vr7 vilvl.w vr8, vr10, vr8 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vabsd.bu vr12, vr3, vr4 vhaddw.hu.bu vr12, vr12, vr12 vadd.h vr11, vr11, vr12 .endr vhaddw.wu.hu vr11, vr11, vr11 vhaddw.du.wu vr11, vr11, vr11 vhaddw.qu.du vr11, vr11, vr11 vpickve2gr.wu a0, vr11, 0 endfunc_x264 /* int32_t x264_pixel_sad_8x4_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_8x4_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.d vr3, vr5, vr3 vilvl.d vr7, vr9, vr7 vilvl.d vr4, vr6, vr4 vilvl.d vr8, vr10, vr8 vabsd.bu vr11, vr3, vr4 vabsd.bu vr12, vr7, vr8 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vadd.h vr6, vr11, vr12 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.du.wu vr6, vr6, vr6 vhaddw.qu.du vr6, vr6, vr6 vpickve2gr.wu a0, vr6, 0 endfunc_x264 /* int32_t x264_pixel_sad_8x8_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_8x8_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.d vr3, vr5, vr3 vilvl.d vr7, vr9, vr7 vilvl.d vr4, vr6, vr4 vilvl.d vr8, vr10, vr8 vabsd.bu vr11, vr3, vr4 
vabsd.bu vr12, vr7, vr8 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vadd.h vr13, vr11, vr12 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.d vr3, vr5, vr3 vilvl.d vr7, vr9, vr7 vilvl.d vr4, vr6, vr4 vilvl.d vr8, vr10, vr8 vabsd.bu vr11, vr3, vr4 vabsd.bu vr12, vr7, vr8 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vadd.h vr6, vr11, vr12 vadd.h vr6, vr6, vr13 vhaddw.wu.hu vr6, vr6, vr6 vhaddw.du.wu vr6, vr6, vr6 vhaddw.qu.du vr6, vr6, vr6 vpickve2gr.wu a0, vr6, 0 endfunc_x264 /* int32_t x264_pixel_sad_8x16_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_8x16_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.d vr3, vr5, vr3 vilvl.d vr7, vr9, vr7 vilvl.d vr4, vr6, vr4 vilvl.d vr8, vr10, vr8 vabsd.bu vr11, vr3, vr4 vabsd.bu vr12, vr7, vr8 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vadd.h vr13, vr11, vr12 .rept 3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 vilvl.d vr3, vr5, vr3 vilvl.d vr7, vr9, vr7 vilvl.d vr4, vr6, vr4 vilvl.d vr8, vr10, vr8 vabsd.bu vr11, vr3, vr4 vabsd.bu vr12, vr7, vr8 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vadd.h vr14, vr11, vr12 vadd.h vr13, vr13, vr14 .endr vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu a0, vr13, 0 endfunc_x264 /* int32_t x264_pixel_sad_16x8_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_16x8_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 vabsd.bu vr8, vr0, vr4 vabsd.bu vr9, vr1, vr5 vabsd.bu vr10, vr2, vr6 vabsd.bu vr11, vr3, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vadd.h vr8, vr8, vr9 vadd.h vr9, vr10, vr11 vadd.h vr14, vr8, vr9 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 vabsd.bu vr8, vr0, vr4 vabsd.bu vr9, vr1, vr5 vabsd.bu vr10, vr2, vr6 vabsd.bu vr11, vr3, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vadd.h vr8, vr8, vr9 vadd.h vr9, vr10, vr11 vadd.h vr12, vr8, vr9 vadd.h vr13, vr12, vr14 vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu a0, vr13, 0 endfunc_x264 /* int32_t x264_pixel_sad_16x16_lsx(uint8_t *p_src, intptr_t i_src_stride, * uint8_t *p_ref, intptr_t i_ref_stride) */ function_x264 pixel_sad_16x16_lsx slli.d t1, a1, 1 slli.d t2, a3, 1 add.d t3, a1, t1 add.d t4, a3, t2 LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 vabsd.bu vr8, vr0, vr4 vabsd.bu vr9, vr1, vr5 vabsd.bu vr10, vr2, vr6 vabsd.bu vr11, vr3, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vadd.h vr8, vr8, vr9 vadd.h vr9, vr10, vr11 vadd.h vr13, vr8, vr9 .rept 3 alsl.d a0, a1, a0, 2 alsl.d a2, a3, a2, 2 LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 vabsd.bu vr8, vr0, vr4 
vabsd.bu vr9, vr1, vr5 vabsd.bu vr10, vr2, vr6 vabsd.bu vr11, vr3, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vadd.h vr8, vr8, vr9 vadd.h vr9, vr10, vr11 vadd.h vr12, vr8, vr9 vadd.h vr13, vr12, vr13 .endr vhaddw.wu.hu vr13, vr13, vr13 vhaddw.du.wu vr13, vr13, vr13 vhaddw.qu.du vr13, vr13, vr13 vpickve2gr.wu a0, vr13, 0 endfunc_x264 /* * void x264_pixel_sad_x3_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_4x8_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.s f3, a0, 0 fld.s f7, a0, 16 fld.s f11, a0, 32 fld.s f15, a0, 48 FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.w vr3, vr7, vr3 vilvl.w vr4, vr8, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr11, vr15, vr11 vilvl.w vr12, vr16, vr12 vilvl.w vr13, vr17, vr13 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr11, vr3 vilvl.d vr4, vr12, vr4 vilvl.d vr5, vr13, vr5 vilvl.d vr6, vr14, vr6 vabsd.bu vr0, vr3, vr4 vabsd.bu vr1, vr3, vr5 vabsd.bu vr2, vr3, vr6 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 fld.s f3, a0, 64 fld.s f7, a0, 80 fld.s f11, a0, 96 fld.s f15, a0, 112 FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.w vr3, vr7, vr3 vilvl.w vr4, vr8, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr11, vr15, vr11 vilvl.w vr12, vr16, vr12 vilvl.w vr13, vr17, vr13 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr11, vr3 vilvl.d vr4, vr12, vr4 vilvl.d vr5, vr13, vr5 vilvl.d vr6, vr14, vr6 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vhaddw.hu.bu vr0, vr0, vr0 vhaddw.hu.bu vr1, vr1, vr1 vhaddw.hu.bu vr2, vr2, vr2 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vadd.h vr7, vr7, vr0 vadd.h vr8, vr8, vr1 vadd.h vr9, vr9, vr2 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.du.wu vr7, vr7, vr7 vhaddw.du.wu vr8, vr8, vr8 vhaddw.du.wu vr9, vr9, vr9 vhaddw.qu.du vr7, vr7, vr7 vhaddw.qu.du vr8, vr8, vr8 vhaddw.qu.du vr9, vr9, vr9 // Store data to p_sad_array vstelm.w vr7, a5, 0, 0 vstelm.w vr8, a5, 4, 0 vstelm.w vr9, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_8x4_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr0, vr3, vr4 vabsd.bu vr1, vr3, vr5 vabsd.bu vr2, vr3, vr6 vabsd.bu vr3, vr11, vr12 vabsd.bu vr4, vr11, vr13 vabsd.bu vr5, vr11, vr14 vhaddw.hu.bu vr0, vr0, vr0 vhaddw.hu.bu vr1, vr1, vr1 vhaddw.hu.bu vr2, vr2, vr2 vhaddw.hu.bu vr3, vr3, vr3 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vadd.h vr7, vr0, vr3 vadd.h vr8, vr1, vr4 vadd.h vr9, vr2, vr5 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 
vhaddw.wu.hu vr9, vr9, vr9 vhaddw.du.wu vr7, vr7, vr7 vhaddw.du.wu vr8, vr8, vr8 vhaddw.du.wu vr9, vr9, vr9 vhaddw.qu.du vr7, vr7, vr7 vhaddw.qu.du vr8, vr8, vr8 vhaddw.qu.du vr9, vr9, vr9 // Store data to p_sad_array vstelm.w vr7, a5, 0, 0 vstelm.w vr8, a5, 4, 0 vstelm.w vr9, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_8x8_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr0, vr7, vr10 vadd.h vr1, vr8, vr15 vadd.h vr2, vr9, vr16 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 fld.d f3, a0, 64 fld.d f7, a0, 80 fld.d f11, a0, 96 fld.d f15, a0, 112 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr7, vr7, vr10 vadd.h vr8, vr8, vr15 vadd.h vr9, vr9, vr16 vadd.h vr7, vr7, vr0 vadd.h vr8, vr8, vr1 vadd.h vr9, vr9, vr2 vhaddw.wu.hu vr7, vr7, vr7 vhaddw.wu.hu vr8, vr8, vr8 vhaddw.wu.hu vr9, vr9, vr9 vhaddw.du.wu vr7, vr7, vr7 vhaddw.du.wu vr8, vr8, vr8 vhaddw.du.wu vr9, vr9, vr9 vhaddw.qu.du vr7, vr7, vr7 vhaddw.qu.du vr8, vr8, vr8 vhaddw.qu.du vr9, vr9, vr9 // Store data to p_sad_array vstelm.w vr7, a5, 0, 0 vstelm.w vr8, a5, 4, 0 vstelm.w vr9, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_8x16_lsx slli.d t1, a4, 1 add.d t2, a4, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, 
vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr0, vr7, vr10 vadd.h vr1, vr8, vr15 vadd.h vr2, vr9, vr16 .rept 3 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 addi.d a0, a0, 64 fld.d f3, a0, 0 fld.d f7, a0, 16 fld.d f11, a0, 32 fld.d f15, a0, 48 FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 vilvl.d vr3, vr7, vr3 vilvl.d vr4, vr8, vr4 vilvl.d vr5, vr9, vr5 vilvl.d vr6, vr10, vr6 vilvl.d vr11, vr15, vr11 vilvl.d vr12, vr16, vr12 vilvl.d vr13, vr17, vr13 vilvl.d vr14, vr18, vr14 vabsd.bu vr7, vr3, vr4 vabsd.bu vr8, vr3, vr5 vabsd.bu vr9, vr3, vr6 vabsd.bu vr10, vr11, vr12 vabsd.bu vr15, vr11, vr13 vabsd.bu vr16, vr11, vr14 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vadd.h vr7, vr7, vr10 vadd.h vr8, vr8, vr15 vadd.h vr9, vr9, vr16 vadd.h vr0, vr7, vr0 vadd.h vr1, vr8, vr1 vadd.h vr2, vr9, vr2 .endr vhaddw.wu.hu vr0, vr0, vr0 vhaddw.wu.hu vr1, vr1, vr1 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.du.wu vr0, vr0, vr0 vhaddw.du.wu vr1, vr1, vr1 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr0, vr0, vr0 vhaddw.qu.du vr1, vr1, vr1 vhaddw.qu.du vr2, vr2, vr2 // Store data to p_sad_array vstelm.w vr0, a5, 0, 0 vstelm.w vr1, a5, 4, 0 vstelm.w vr2, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_16x8_lsx slli.d t1, a4, 1 add.d t2, a4, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr16, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr17, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr18, vr1, vr0 // vr16, vr17, vr18 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 vld vr0, a0, 64 vld vr1, a0, 80 vld vr2, a0, 96 vld vr3, a0, 112 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, 
vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr2, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr3, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr4, vr1, vr0 vadd.h vr0, vr16, vr2 vadd.h vr1, vr17, vr3 vadd.h vr2, vr18, vr4 vhaddw.wu.hu vr0, vr0, vr0 vhaddw.wu.hu vr1, vr1, vr1 vhaddw.wu.hu vr2, vr2, vr2 vhaddw.du.wu vr0, vr0, vr0 vhaddw.du.wu vr1, vr1, vr1 vhaddw.du.wu vr2, vr2, vr2 vhaddw.qu.du vr0, vr0, vr0 vhaddw.qu.du vr1, vr1, vr1 vhaddw.qu.du vr2, vr2, vr2 // Store data to p_sad_array vstelm.w vr0, a5, 0, 0 vstelm.w vr1, a5, 4, 0 vstelm.w vr2, a5, 8, 0 endfunc_x264 /* * void x264_pixel_sad_x3_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * intptr_t i_ref_stride, * int32_t p_sad_array[3]) */ function_x264 pixel_sad_x3_16x16_lsx slli.d t1, a4, 1 add.d t2, a4, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr16, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr17, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr18, vr1, vr0 .rept 3 alsl.d a1, a4, a1, 2 alsl.d a2, a4, a2, 2 alsl.d a3, a4, a3, 2 addi.d a0, a0, 64 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr1, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr2, vr10 vabsd.bu vr11, vr2, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr3, vr13 vabsd.bu vr14, vr3, vr14 vabsd.bu vr15, vr3, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr0, vr7, vr4 vadd.h vr1, vr13, vr10 vadd.h vr2, vr1, vr0 vadd.h vr0, vr8, vr5 vadd.h vr1, vr14, vr11 vadd.h vr3, vr1, vr0 vadd.h vr0, vr9, vr6 vadd.h vr1, vr15, vr12 vadd.h vr4, vr1, vr0 vadd.h vr16, vr16, vr2 vadd.h vr17, vr17, vr3 vadd.h vr18, vr18, vr4 .endr vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 // Store data to p_sad_array vstelm.w vr16, a5, 0, 0 vstelm.w vr17, a5, 4, 0 vstelm.w vr18, a5, 8, 0 endfunc_x264 /* * void 
x264_pixel_sad_x4_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_4x8_lsx slli.d t1, a5, 1 add.d t2, a5, t1 fld.s f0, a0, 0 fld.s f1, a0, 16 fld.s f2, a0, 32 fld.s f3, a0, 48 FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.w vr0, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.d vr0, vr2, vr0 vilvl.w vr4, vr8, vr4 vilvl.w vr12, vr16, vr12 vilvl.d vr1, vr12, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr13, vr17, vr13 vilvl.d vr2, vr13, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr14, vr6 vilvl.w vr7, vr11, vr7 vilvl.w vr15, vr19, vr15 vilvl.d vr4, vr15, vr7 vabsd.bu vr1, vr0, vr1 vabsd.bu vr2, vr0, vr2 vabsd.bu vr3, vr0, vr3 vabsd.bu vr4, vr0, vr4 vhaddw.hu.bu vr20, vr1, vr1 vhaddw.hu.bu vr21, vr2, vr2 vhaddw.hu.bu vr22, vr3, vr3 vhaddw.hu.bu vr23, vr4, vr4 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 fld.s f0, a0, 64 fld.s f1, a0, 80 fld.s f2, a0, 96 fld.s f3, a0, 112 FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.w vr0, vr1, vr0 vilvl.w vr2, vr3, vr2 vilvl.d vr0, vr2, vr0 vilvl.w vr4, vr8, vr4 vilvl.w vr12, vr16, vr12 vilvl.d vr1, vr12, vr4 vilvl.w vr5, vr9, vr5 vilvl.w vr13, vr17, vr13 vilvl.d vr2, vr13, vr5 vilvl.w vr6, vr10, vr6 vilvl.w vr14, vr18, vr14 vilvl.d vr3, vr14, vr6 vilvl.w vr7, vr11, vr7 vilvl.w vr15, vr19, vr15 vilvl.d vr4, vr15, vr7 vabsd.bu vr1, vr0, vr1 vabsd.bu vr2, vr0, vr2 vabsd.bu vr3, vr0, vr3 vabsd.bu vr4, vr0, vr4 vhaddw.hu.bu vr1, vr1, vr1 vhaddw.hu.bu vr2, vr2, vr2 vhaddw.hu.bu vr3, vr3, vr3 vhaddw.hu.bu vr4, vr4, vr4 vadd.h vr16, vr20, vr1 vadd.h vr17, vr21, vr2 vadd.h vr18, vr22, vr3 vadd.h vr19, vr23, vr4 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.du.wu vr19, vr19, vr19 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 vhaddw.qu.du vr19, vr19, vr19 // Store data to p_sad_array vstelm.w vr16, a6, 0, 0 vstelm.w vr17, a6, 4, 0 vstelm.w vr18, a6, 8, 0 vstelm.w vr19, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x4_lsx slli.d t1, a5, 1 add.d t2, a5, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 
vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr16, vr4, vr12 vadd.h vr17, vr5, vr13 vadd.h vr18, vr6, vr14 vadd.h vr19, vr7, vr15 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.du.wu vr19, vr19, vr19 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du vr18, vr18, vr18 vhaddw.qu.du vr19, vr19, vr19 // Store data to p_sad_array vstelm.w vr16, a6, 0, 0 vstelm.w vr17, a6, 4, 0 vstelm.w vr18, a6, 8, 0 vstelm.w vr19, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x8_lsx slli.d t1, a5, 1 add.d t2, a5, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr20, vr4, vr12 vadd.h vr21, vr5, vr13 vadd.h vr22, vr6, vr14 vadd.h vr23, vr7, vr15 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 fld.d f0, a0, 64 fld.d f1, a0, 80 fld.d f2, a0, 96 fld.d f3, a0, 112 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr16, vr4, vr12 vadd.h vr17, vr5, vr13 vadd.h vr18, vr6, vr14 vadd.h vr19, vr7, vr15 vadd.h vr16, vr16, vr20 vadd.h vr17, vr17, vr21 vadd.h vr18, vr18, vr22 vadd.h vr19, vr19, vr23 vhaddw.wu.hu vr16, vr16, vr16 vhaddw.wu.hu vr17, vr17, vr17 vhaddw.wu.hu vr18, vr18, vr18 vhaddw.wu.hu vr19, vr19, vr19 vhaddw.du.wu vr16, vr16, vr16 vhaddw.du.wu vr17, vr17, vr17 vhaddw.du.wu vr18, vr18, vr18 vhaddw.du.wu vr19, vr19, vr19 vhaddw.qu.du vr16, vr16, vr16 vhaddw.qu.du vr17, vr17, vr17 vhaddw.qu.du 
vr18, vr18, vr18 vhaddw.qu.du vr19, vr19, vr19 // Store data to p_sad_array vstelm.w vr16, a6, 0, 0 vstelm.w vr17, a6, 4, 0 vstelm.w vr18, a6, 8, 0 vstelm.w vr19, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_8x16_lsx slli.d t1, a5, 1 add.d t2, a5, t1 // Load data from p_src, p_ref0, p_ref1 and p_ref2 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr20, vr4, vr12 vadd.h vr21, vr5, vr13 vadd.h vr22, vr6, vr14 vadd.h vr23, vr7, vr15 .rept 3 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 addi.d a0, a0, 64 fld.d f0, a0, 0 fld.d f1, a0, 16 fld.d f2, a0, 32 fld.d f3, a0, 48 FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 vilvl.d vr0, vr1, vr0 vilvl.d vr2, vr3, vr2 vilvl.d vr4, vr8, vr4 vilvl.d vr12, vr16, vr12 vilvl.d vr5, vr9, vr5 vilvl.d vr13, vr17, vr13 vilvl.d vr6, vr10, vr6 vilvl.d vr14, vr18, vr14 vilvl.d vr7, vr11, vr7 vilvl.d vr15, vr19, vr15 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vadd.h vr16, vr4, vr12 vadd.h vr17, vr5, vr13 vadd.h vr18, vr6, vr14 vadd.h vr19, vr7, vr15 vadd.h vr20, vr16, vr20 vadd.h vr21, vr17, vr21 vadd.h vr22, vr18, vr22 vadd.h vr23, vr19, vr23 .endr vhaddw.wu.hu vr20, vr20, vr20 vhaddw.wu.hu vr21, vr21, vr21 vhaddw.wu.hu vr22, vr22, vr22 vhaddw.wu.hu vr23, vr23, vr23 vhaddw.du.wu vr20, vr20, vr20 vhaddw.du.wu vr21, vr21, vr21 vhaddw.du.wu vr22, vr22, vr22 vhaddw.du.wu vr23, vr23, vr23 vhaddw.qu.du vr20, vr20, vr20 vhaddw.qu.du vr21, vr21, vr21 vhaddw.qu.du vr22, vr22, vr22 vhaddw.qu.du vr23, vr23, vr23 // Store data to p_sad_array vstelm.w vr20, a6, 0, 0 vstelm.w vr21, a6, 4, 0 vstelm.w vr22, a6, 8, 0 vstelm.w vr23, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_16x8_lsx slli.d t1, a5, 1 add.d t2, a5, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 
LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr20, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr21, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr22, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr23, vr0, vr1 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 vld vr0, a0, 64 vld vr1, a0, 80 vld vr2, a0, 96 vld vr3, a0, 112 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr16, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr17, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr18, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr19, vr0, vr1 vadd.h vr20, vr16, vr20 vadd.h vr21, vr17, vr21 vadd.h vr22, vr18, vr22 vadd.h vr23, vr19, vr23 vhaddw.wu.hu vr20, vr20, vr20 vhaddw.wu.hu vr21, vr21, vr21 vhaddw.wu.hu vr22, vr22, vr22 vhaddw.wu.hu vr23, vr23, vr23 vhaddw.du.wu vr20, vr20, vr20 vhaddw.du.wu vr21, vr21, vr21 vhaddw.du.wu vr22, vr22, vr22 vhaddw.du.wu vr23, vr23, vr23 vhaddw.qu.du vr20, vr20, vr20 vhaddw.qu.du vr21, vr21, vr21 vhaddw.qu.du vr22, vr22, vr22 vhaddw.qu.du vr23, vr23, vr23 // Store data to p_sad_array vstelm.w vr20, a6, 0, 0 vstelm.w vr21, a6, 4, 0 vstelm.w vr22, a6, 8, 0 vstelm.w vr23, a6, 12, 0 endfunc_x264 /* * void x264_pixel_sad_x4_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, * uint8_t *p_ref1, uint8_t *p_ref2, * uint8_t *p_ref3, intptr_t i_ref_stride, * int32_t p_sad_array[4]) */ function_x264 pixel_sad_x4_16x16_lsx slli.d t1, a5, 1 
add.d t2, a5, t1 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr20, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr21, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr22, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr23, vr0, vr1 .rept 3 alsl.d a1, a5, a1, 2 alsl.d a2, a5, a2, 2 alsl.d a3, a5, a3, 2 alsl.d a4, a5, a4, 2 addi.d a0, a0, 64 vld vr0, a0, 0 vld vr1, a0, 16 vld vr2, a0, 32 vld vr3, a0, 48 LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 vabsd.bu vr4, vr0, vr4 vabsd.bu vr5, vr0, vr5 vabsd.bu vr6, vr0, vr6 vabsd.bu vr7, vr0, vr7 vabsd.bu vr8, vr1, vr8 vabsd.bu vr9, vr1, vr9 vabsd.bu vr10, vr1, vr10 vabsd.bu vr11, vr1, vr11 vabsd.bu vr12, vr2, vr12 vabsd.bu vr13, vr2, vr13 vabsd.bu vr14, vr2, vr14 vabsd.bu vr15, vr2, vr15 vabsd.bu vr16, vr3, vr16 vabsd.bu vr17, vr3, vr17 vabsd.bu vr18, vr3, vr18 vabsd.bu vr19, vr3, vr19 vhaddw.hu.bu vr4, vr4, vr4 vhaddw.hu.bu vr5, vr5, vr5 vhaddw.hu.bu vr6, vr6, vr6 vhaddw.hu.bu vr7, vr7, vr7 vhaddw.hu.bu vr8, vr8, vr8 vhaddw.hu.bu vr9, vr9, vr9 vhaddw.hu.bu vr10, vr10, vr10 vhaddw.hu.bu vr11, vr11, vr11 vhaddw.hu.bu vr12, vr12, vr12 vhaddw.hu.bu vr13, vr13, vr13 vhaddw.hu.bu vr14, vr14, vr14 vhaddw.hu.bu vr15, vr15, vr15 vhaddw.hu.bu vr16, vr16, vr16 vhaddw.hu.bu vr17, vr17, vr17 vhaddw.hu.bu vr18, vr18, vr18 vhaddw.hu.bu vr19, vr19, vr19 vadd.h vr0, vr4, vr8 vadd.h vr1, vr12, vr16 vadd.h vr16, vr0, vr1 vadd.h vr0, vr5, vr9 vadd.h vr1, vr13, vr17 vadd.h vr17, vr0, vr1 vadd.h vr0, vr6, vr10 vadd.h vr1, vr14, vr18 vadd.h vr18, vr0, vr1 vadd.h vr0, vr7, vr11 vadd.h vr1, vr15, vr19 vadd.h vr19, vr0, vr1 vadd.h vr20, vr16, vr20 vadd.h vr21, vr17, vr21 vadd.h vr22, vr18, vr22 vadd.h vr23, vr19, vr23 .endr vhaddw.wu.hu vr20, vr20, vr20 vhaddw.wu.hu vr21, vr21, vr21 vhaddw.wu.hu vr22, vr22, vr22 vhaddw.wu.hu vr23, vr23, vr23 vhaddw.du.wu vr20, vr20, vr20 vhaddw.du.wu vr21, vr21, vr21 vhaddw.du.wu vr22, vr22, vr22 vhaddw.du.wu vr23, vr23, vr23 vhaddw.qu.du vr20, vr20, vr20 vhaddw.qu.du vr21, vr21, vr21 vhaddw.qu.du vr22, vr22, vr22 vhaddw.qu.du vr23, vr23, vr23 // Store data to p_sad_array vstelm.w vr20, a6, 0, 0 vstelm.w vr21, a6, 4, 0 vstelm.w vr22, a6, 8, 0 vstelm.w vr23, a6, 12, 0 endfunc_x264 #endif /* !HIGH_BIT_DEPTH */ 
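Editor's note: the LSX/LASX kernels above all compute plain sums of absolute differences, either against a single reference block or against three/four candidate references at once; the per-lane work is vabsd.bu/xvabsd.bu followed by the widening horizontal-add chain (vhaddw.hu.bu, then .wu.hu, .du.wu, .qu.du) that reduces the byte differences to one 32-bit sum per reference. The scalar C sketch below models the two signatures documented in the comments above, purely as a reading aid: it is not part of x264, the helper names and the explicit width/height parameters are illustrative assumptions, and the real entry points hard-code the block size in their names.

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the single-reference kernels, e.g.
 * int32_t x264_pixel_sad_WxH(p_src, i_src_stride, p_ref, i_ref_stride). */
static int32_t sad_wxh( const uint8_t *p_src, intptr_t i_src_stride,
                        const uint8_t *p_ref, intptr_t i_ref_stride,
                        int width, int height )
{
    int32_t i_sum = 0;
    for( int y = 0; y < height; y++ )
        for( int x = 0; x < width; x++ )
            i_sum += abs( p_src[y*i_src_stride + x] - p_ref[y*i_ref_stride + x] );
    return i_sum;
}

/* Scalar model of the x4 variants: one source block scored against four
 * reference candidates that share a stride. The source rows are read with a
 * fixed stride of 16 bytes, matching the 0/16/32/48 byte source offsets used
 * by the kernels above. */
static void sad_x4_wxh( const uint8_t *p_src,
                        const uint8_t *p_ref0, const uint8_t *p_ref1,
                        const uint8_t *p_ref2, const uint8_t *p_ref3,
                        intptr_t i_ref_stride, int32_t p_sad_array[4],
                        int width, int height )
{
    p_sad_array[0] = sad_wxh( p_src, 16, p_ref0, i_ref_stride, width, height );
    p_sad_array[1] = sad_wxh( p_src, 16, p_ref1, i_ref_stride, width, height );
    p_sad_array[2] = sad_wxh( p_src, 16, p_ref2, i_ref_stride, width, height );
    p_sad_array[3] = sad_wxh( p_src, 16, p_ref3, i_ref_stride, width, height );
}

The x3 variants behave the same way with the fourth reference dropped; the assembly versions differ only in how rows are packed into 128-bit/256-bit registers before the final reduction.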
x264-master/common/macroblock.c000066400000000000000000002404221502133446700166330ustar00rootroot00000000000000
/*****************************************************************************
 * macroblock.c: macroblock common functions
 *****************************************************************************
 * Copyright (C) 2003-2025 x264 project
 *
 * Authors: Fiona Glaser
 *          Laurent Aimar
 *          Loren Merritt
 *          Henrik Gramner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#define MC_LUMA(list,p) \
    h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
                   &h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
                   mvx, mvy, 4*width, 4*height, \
                   list ? x264_weight_none : &h->sh.weight[i_ref][p] );

static NOINLINE void mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8 = x264_scan8[0]+x+8*y;
    int i_ref = h->mb.cache.ref[0][i8];
    int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

    MC_LUMA( 0, 0 );

    if( CHROMA444 )
    {
        MC_LUMA( 0, 1 );
        MC_LUMA( 0, 2 );
    }
    else if( CHROMA_FORMAT )
    {
        int v_shift = CHROMA_V_SHIFT;
        // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
        if( v_shift & MB_INTERLACED & i_ref )
            mvy += (h->mb.i_mb_y & 1)*4 - 2;

        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        height = 4*height >> v_shift;

        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset], &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                         h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
                         mvx, 2*mvy>>v_shift, 2*width, height );

        if( h->sh.weight[i_ref][1].weightfn )
            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
                                                       &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
                                                       &h->sh.weight[i_ref][1], height );
        if( h->sh.weight[i_ref][2].weightfn )
            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                                                       &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                                                       &h->sh.weight[i_ref][2], height );
    }
}

static NOINLINE void mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8 = x264_scan8[0]+x+8*y;
    int i_ref = h->mb.cache.ref[1][i8];
    int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

    MC_LUMA( 1, 0 );

    if( CHROMA444 )
    {
        MC_LUMA( 1, 1 );
        MC_LUMA( 1, 2 );
    }
    else if( CHROMA_FORMAT )
    {
        int v_shift = CHROMA_V_SHIFT;
        if( v_shift & MB_INTERLACED & i_ref )
            mvy += (h->mb.i_mb_y & 1)*4 - 2;

        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset], &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift ); } } #define MC_LUMA_BI(p) \ src0 = h->mc.get_ref( tmp0, &i_stride0, &h->mb.pic.p_fref[0][i_ref0][p*4], h->mb.pic.i_stride[p], \ mvx0, mvy0, 4*width, 4*height, x264_weight_none ); \ src1 = h->mc.get_ref( tmp1, &i_stride1, &h->mb.pic.p_fref[1][i_ref1][p*4], h->mb.pic.i_stride[p], \ mvx1, mvy1, 4*width, 4*height, x264_weight_none ); \ h->mc.avg[i_mode]( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \ src0, i_stride0, src1, i_stride1, weight ); static NOINLINE void mb_mc_01xywh( x264_t *h, int x, int y, int width, int height ) { int i8 = x264_scan8[0]+x+8*y; int i_ref0 = h->mb.cache.ref[0][i8]; int i_ref1 = h->mb.cache.ref[1][i8]; int weight = h->mb.bipred_weight[i_ref0][i_ref1]; int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x; int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x; int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int i_mode = x264_size2pixel[height][width]; intptr_t i_stride0 = 16, i_stride1 = 16; ALIGNED_ARRAY_32( pixel, tmp0,[16*16] ); ALIGNED_ARRAY_32( pixel, tmp1,[16*16] ); pixel *src0, *src1; MC_LUMA_BI( 0 ); if( CHROMA444 ) { MC_LUMA_BI( 1 ); MC_LUMA_BI( 2 ); } else if( CHROMA_FORMAT ) { int v_shift = CHROMA_V_SHIFT; if( v_shift & MB_INTERLACED & i_ref0 ) mvy0 += (h->mb.i_mb_y & 1)*4 - 2; if( v_shift & MB_INTERLACED & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift ); h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift ); int chromapix = h->luma2chroma_pixel[i_mode]; int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } } #undef MC_LUMA #undef MC_LUMA_BI void x264_mb_mc_8x8( x264_t *h, int i8 ) { int x = 2*(i8&1); int y = 2*(i8>>1); if( h->sh.i_type == SLICE_TYPE_P ) { switch( h->mb.i_sub_partition[i8] ) { case D_L0_8x8: mb_mc_0xywh( h, x, y, 2, 2 ); break; case D_L0_8x4: mb_mc_0xywh( h, x, y+0, 2, 1 ); mb_mc_0xywh( h, x, y+1, 2, 1 ); break; case D_L0_4x8: mb_mc_0xywh( h, x+0, y, 1, 2 ); mb_mc_0xywh( h, x+1, y, 1, 2 ); break; case D_L0_4x4: mb_mc_0xywh( h, x+0, y+0, 1, 1 ); mb_mc_0xywh( h, x+1, y+0, 1, 1 ); mb_mc_0xywh( h, x+0, y+1, 1, 1 ); mb_mc_0xywh( h, x+1, y+1, 1, 1 ); break; } } else { int scan8 = x264_scan8[0] + x + 8*y; if( h->mb.cache.ref[0][scan8] >= 0 ) if( h->mb.cache.ref[1][scan8] >= 0 ) mb_mc_01xywh( h, x, y, 2, 2 ); else mb_mc_0xywh( h, x, y, 2, 2 ); else mb_mc_1xywh( h, x, y, 2, 2 ); } } void x264_mb_mc( x264_t *h ) { if( h->mb.i_partition == D_8x8 ) { for( int i = 0; i < 4; i++ ) x264_mb_mc_8x8( h, i ); } else { int ref0a = h->mb.cache.ref[0][x264_scan8[ 0]]; int ref0b = h->mb.cache.ref[0][x264_scan8[12]]; int ref1a = h->mb.cache.ref[1][x264_scan8[ 0]]; int ref1b = h->mb.cache.ref[1][x264_scan8[12]]; if( h->mb.i_partition == D_16x16 ) { if( ref0a >= 0 ) if( ref1a >= 0 ) mb_mc_01xywh( h, 0, 0, 4, 4 ); else mb_mc_0xywh ( h, 0, 0, 4, 4 ); else mb_mc_1xywh ( h, 0, 0, 4, 4 ); } else if( h->mb.i_partition == D_16x8 ) { if( ref0a 
>= 0 ) if( ref1a >= 0 ) mb_mc_01xywh( h, 0, 0, 4, 2 ); else mb_mc_0xywh ( h, 0, 0, 4, 2 ); else mb_mc_1xywh ( h, 0, 0, 4, 2 ); if( ref0b >= 0 ) if( ref1b >= 0 ) mb_mc_01xywh( h, 0, 2, 4, 2 ); else mb_mc_0xywh ( h, 0, 2, 4, 2 ); else mb_mc_1xywh ( h, 0, 2, 4, 2 ); } else if( h->mb.i_partition == D_8x16 ) { if( ref0a >= 0 ) if( ref1a >= 0 ) mb_mc_01xywh( h, 0, 0, 2, 4 ); else mb_mc_0xywh ( h, 0, 0, 2, 4 ); else mb_mc_1xywh ( h, 0, 0, 2, 4 ); if( ref0b >= 0 ) if( ref1b >= 0 ) mb_mc_01xywh( h, 2, 0, 2, 4 ); else mb_mc_0xywh ( h, 2, 0, 2, 4 ); else mb_mc_1xywh ( h, 2, 0, 2, 4 ); } } } int x264_macroblock_cache_allocate( x264_t *h ) { int i_mb_count = h->mb.i_mb_count; h->mb.i_mb_stride = h->mb.i_mb_width; h->mb.i_b8_stride = h->mb.i_mb_width * 2; h->mb.i_b4_stride = h->mb.i_mb_width * 4; h->mb.b_interlaced = PARAM_INTERLACED; PREALLOC_INIT PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) ); PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) ); PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) ); PREALLOC( h->mb.slice_table, i_mb_count * sizeof(int32_t) ); /* 0 -> 3 top(4), 4 -> 6 : left(3) */ PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) ); /* all coeffs */ PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) ); if( h->param.b_cabac ) { PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) ); PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) ); PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) ); if( h->param.i_bframe ) PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) ); } for( int i = 0; i < 2; i++ ) { int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED; if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit for( int j = !i; j < i_refs; j++ ) PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) ); } if( h->param.analyse.i_weighted_pred ) { int i_padv = PADV << PARAM_INTERLACED; int luma_plane_size = 0; int numweightbuf; if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) { // only need buffer for lookahead if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] ) { // Fake analysis only works on lowres luma_plane_size = h->fdec->i_stride_lowres * (h->mb.i_mb_height*8+2*i_padv); // Only need 1 buffer for analysis numweightbuf = 1; } else numweightbuf = 0; } else { /* Both ref and fenc is stored for 4:2:0 and 4:2:2 which means that 4:2:0 and 4:4:4 * needs the same amount of space and 4:2:2 needs twice that much */ luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv); if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) //smart can weight one ref and one offset -1 in 8-bit numweightbuf = 1 + (BIT_DEPTH == 8); else //simple only has one weighted ref numweightbuf = 1; } for( int i = 0; i < numweightbuf; i++ ) PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * SIZEOF_PIXEL ); } PREALLOC_END( h->mb.base ); memset( h->mb.slice_table, -1, i_mb_count * sizeof(int32_t) ); for( int i = 0; i < 2; i++ ) { int i_refs = X264_MIN(X264_REF_MAX, (i ? 
1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED; if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit for( int j = !i; j < i_refs; j++ ) { M32( h->mb.mvr[i][j][0] ) = 0; h->mb.mvr[i][j]++; } } return 0; fail: return -1; } void x264_macroblock_cache_free( x264_t *h ) { x264_free( h->mb.base ); } int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) { if( !b_lookahead ) { for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ ) for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ ) { CHECKED_MALLOC( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * SIZEOF_PIXEL ); h->intra_border_backup[i][j] += 16; } for( int i = 0; i <= PARAM_INTERLACED; i++ ) { if( h->param.b_sliced_threads ) { /* Only allocate the first one, and allocate it for the whole frame, because we * won't be deblocking until after the frame is fully encoded. */ if( h == h->thread[0] && !i ) CHECKED_MALLOC( h->deblock_strength[0], sizeof(**h->deblock_strength) * h->mb.i_mb_count ); else h->deblock_strength[i] = h->thread[0]->deblock_strength[0]; } else CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width ); h->deblock_strength[1] = h->deblock_strength[i]; } } /* Allocate scratch buffer */ int scratch_size = 0; if( !b_lookahead ) { int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t); int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int); int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range); int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) * ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa ); } int buf_mbtree = h->param.rc.b_mb_tree * ALIGN( h->mb.i_mb_width * sizeof(int16_t), NATIVE_ALIGN ); scratch_size = X264_MAX( scratch_size, buf_mbtree ); if( scratch_size ) CHECKED_MALLOC( h->scratch_buffer, scratch_size ); else h->scratch_buffer = NULL; int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2; int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */ scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 ); CHECKED_MALLOC( h->scratch_buffer2, scratch_size ); return 0; fail: return -1; } void x264_macroblock_thread_free( x264_t *h, int b_lookahead ) { if( !b_lookahead ) { for( int i = 0; i <= PARAM_INTERLACED; i++ ) if( !h->param.b_sliced_threads || (h == h->thread[0] && !i) ) x264_free( h->deblock_strength[i] ); for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ ) for( int j = 0; j < (CHROMA444 ? 
3 : 2); j++ ) x264_free( h->intra_border_backup[i][j] - 16 ); } x264_free( h->scratch_buffer ); x264_free( h->scratch_buffer2 ); } void x264_macroblock_slice_init( x264_t *h ) { h->mb.mv[0] = h->fdec->mv[0]; h->mb.mv[1] = h->fdec->mv[1]; h->mb.mvr[0][0] = h->fdec->mv16x16; h->mb.ref[0] = h->fdec->ref[0]; h->mb.ref[1] = h->fdec->ref[1]; h->mb.type = h->fdec->mb_type; h->mb.partition = h->fdec->mb_partition; h->mb.field = h->fdec->field; h->fdec->i_ref[0] = h->i_ref[0]; h->fdec->i_ref[1] = h->i_ref[1]; for( int i = 0; i < h->i_ref[0]; i++ ) h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc; if( h->sh.i_type == SLICE_TYPE_B ) { for( int i = 0; i < h->i_ref[1]; i++ ) h->fdec->ref_poc[1][i] = h->fref[1][i]->i_poc; map_col_to_list0(-1) = -1; map_col_to_list0(-2) = -2; for( int i = 0; i < h->fref[1][0]->i_ref[0]; i++ ) { int poc = h->fref[1][0]->ref_poc[0][i]; map_col_to_list0(i) = -2; for( int j = 0; j < h->i_ref[0]; j++ ) if( h->fref[0][j]->i_poc == poc ) { map_col_to_list0(i) = j; break; } } } else if( h->sh.i_type == SLICE_TYPE_P ) { if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) { deblock_ref_table(-2) = -2; deblock_ref_table(-1) = -1; for( int i = 0; i < h->i_ref[0] << SLICE_MBAFF; i++ ) { /* Mask off high bits to avoid frame num collisions with -1/-2. * In current x264 frame num values don't cover a range of more * than 32, so 6 bits is enough for uniqueness. */ if( !MB_INTERLACED ) deblock_ref_table(i) = h->fref[0][i]->i_frame_num&63; else deblock_ref_table(i) = ((h->fref[0][i>>1]->i_frame_num&63)<<1) + (i&1); } } } /* init with not available (for top right idx=7,15) */ memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) ); if( h->i_ref[0] > 0 ) for( int field = 0; field <= SLICE_MBAFF; field++ ) { int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field]; int refpoc = h->fref[0][0]->i_poc + h->fref[0][0]->i_delta_poc[field]; int delta = curpoc - refpoc; h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta; } h->mb.i_neighbour4[6] = h->mb.i_neighbour4[9] = h->mb.i_neighbour4[12] = h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT; h->mb.i_neighbour4[3] = h->mb.i_neighbour4[7] = h->mb.i_neighbour4[11] = h->mb.i_neighbour4[13] = h->mb.i_neighbour4[15] = h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT; } void x264_macroblock_thread_init( x264_t *h ) { h->mb.i_me_method = h->param.analyse.i_me_method; h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine; if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) ) h->mb.i_subpel_refine--; h->mb.b_chroma_me = h->param.analyse.b_chroma_me && ((h->sh.i_type == SLICE_TYPE_P && h->mb.i_subpel_refine >= 5) || (h->sh.i_type == SLICE_TYPE_B && h->mb.i_subpel_refine >= 9)); h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I); h->mb.i_mb_prev_xy = -1; /* 4:2:0 4:2:2 4:4:4 * fdec fenc fdec fenc fdec fenc * y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y * y Y Y Y Y U U V V y Y Y Y Y U U V V y Y Y Y Y U U U U * u u u v v v U U V V u u u v v v U U V V u u u u u u u U U U U * u U U v V V u U U v V V U U V V u U U U U U U U U * u U U v V V u U U v V V U U V V u U U U U U U U U * u U U v V V u U U U U V V V V * u U U v V V u U U U U V V V V * v v v v v v v V V V V * v V V V V V V V V * v V V V V 
* v V V V V * v V V V V */ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE; if( CHROMA_FORMAT ) { h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE; h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE; if( CHROMA444 ) { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE; h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE; } else { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8; h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16; } } } void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) { int stride_y = fenc->i_stride[0]; int stride_uv = fenc->i_stride[1]; int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y; int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> CHROMA_V_SHIFT); h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, fenc->plane[1] != NULL ? fenc->plane[1]+off_uv : NULL, stride_uv, i_mb_x ); } NOINLINE void x264_copy_column8( pixel *dst, pixel *src ) { // input pointers are offset by 4 rows because that's faster (smaller instruction size on x86) for( int i = -4; i < 4; i++ ) dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE]; } static ALWAYS_INLINE void macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) { int mb_interlaced = b_mbaff && MB_INTERLACED; int height = b_chroma ? 16 >> CHROMA_V_SHIFT : 16; int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << mb_interlaced; int i_pix_offset = mb_interlaced ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride : 16 * mb_x + height * mb_y * i_stride; pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset]; int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : !(mb_y&1); pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16]; int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; /* ref_pix_offset[0] references the current field and [1] the opposite field. */ if( mb_interlaced ) ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride; h->mb.pic.i_stride[i] = i_stride2; h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; if( b_chroma ) { h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height ); memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL ); memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL ); h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8]; h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1]; } else { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 ); memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL ); h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1]; } if( b_mbaff || h->mb.b_reencode_mb ) { for( int j = 0; j < height; j++ ) if( b_chroma ) { h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; } else h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; } pixel *plane_src, **filtered_src; for( int j = 0; j < h->mb.pic.i_fref[0]; j++ ) { // Interpolate between pixels in same field. 
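// In MBAFF, reference list entries come in field pairs: j>>1 selects the frame
// and j&1 selects the field, so same-parity references use ref_pix_offset[0]
// and opposite-parity references use ref_pix_offset[1] below.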
if( mb_interlaced ) { plane_src = h->fref[0][j>>1]->plane_fld[i]; filtered_src = h->fref[0][j>>1]->filtered_fld[i]; } else { plane_src = h->fref[0][j]->plane[i]; filtered_src = h->fref[0][j]->filtered[i]; } h->mb.pic.p_fref[0][j][i*4] = plane_src + ref_pix_offset[j&1]; if( !b_chroma ) { if( h->param.analyse.i_subpel_refine ) for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[0][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1]; if( !i ) { if( h->sh.weight[j][0].weightfn ) h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> mb_interlaced][ref_pix_offset[j&1]]; else h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0]; } } } if( h->sh.i_type == SLICE_TYPE_B ) for( int j = 0; j < h->mb.pic.i_fref[1]; j++ ) { if( mb_interlaced ) { plane_src = h->fref[1][j>>1]->plane_fld[i]; filtered_src = h->fref[1][j>>1]->filtered_fld[i]; } else { plane_src = h->fref[1][j]->plane[i]; filtered_src = h->fref[1][j]->filtered[i]; } h->mb.pic.p_fref[1][j][i*4] = plane_src + ref_pix_offset[j&1]; if( !b_chroma && h->param.analyse.i_subpel_refine ) for( int k = 1; k < 4; k++ ) h->mb.pic.p_fref[1][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1]; } } static const x264_left_table_t left_indices[4] = { /* Current is progressive */ {{ 4, 4, 5, 5}, { 3, 3, 7, 7}, {16+1, 16+1, 32+1, 32+1}, {0, 0, 1, 1}, {0, 0, 0, 0}}, {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+5, 16+5, 32+5, 32+5}, {2, 2, 3, 3}, {1, 1, 1, 1}}, /* Current is interlaced */ {{ 4, 6, 4, 6}, { 3, 11, 3, 11}, {16+1, 16+1, 32+1, 32+1}, {0, 2, 0, 2}, {0, 1, 0, 1}}, /* Both same */ {{ 4, 5, 6, 3}, { 3, 7, 11, 15}, {16+1, 16+5, 32+1, 32+5}, {0, 1, 2, 3}, {0, 0, 1, 1}} }; static ALWAYS_INLINE void macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y, int b_interlaced ) { const int mb_interlaced = b_interlaced && MB_INTERLACED; int top_y = mb_y - (1 << mb_interlaced); int top = top_y * h->mb.i_mb_stride + mb_x; h->mb.i_mb_x = mb_x; h->mb.i_mb_y = mb_y; h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x; h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x); h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x); h->mb.left_b8[0] = h->mb.left_b8[1] = -1; h->mb.left_b4[0] = h->mb.left_b4[1] = -1; h->mb.i_neighbour = 0; h->mb.i_neighbour_intra = 0; h->mb.i_neighbour_frame = 0; h->mb.i_mb_top_xy = -1; h->mb.i_mb_top_y = -1; h->mb.i_mb_left_xy[0] = h->mb.i_mb_left_xy[1] = -1; h->mb.i_mb_topleft_xy = -1; h->mb.i_mb_topright_xy = -1; h->mb.i_mb_type_top = -1; h->mb.i_mb_type_left[0] = h->mb.i_mb_type_left[1] = -1; h->mb.i_mb_type_topleft = -1; h->mb.i_mb_type_topright = -1; h->mb.left_index_table = &left_indices[3]; h->mb.topleft_partition = 0; int topleft_y = top_y; int topright_y = top_y; int left[2]; left[0] = left[1] = h->mb.i_mb_xy - 1; h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2; h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4; if( b_interlaced ) { h->mb.i_mb_top_mbpair_xy = h->mb.i_mb_xy - 2*h->mb.i_mb_stride; h->mb.i_mb_topleft_y = -1; h->mb.i_mb_topright_y = -1; if( mb_y&1 ) { if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] ) { left[0] = left[1] = h->mb.i_mb_xy - 1 - h->mb.i_mb_stride; h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2 - 2*h->mb.i_b8_stride; h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4 - 4*h->mb.i_b4_stride; if( mb_interlaced ) { h->mb.left_index_table = &left_indices[2]; left[1] += h->mb.i_mb_stride; h->mb.left_b8[1] += 2*h->mb.i_b8_stride; h->mb.left_b4[1] += 4*h->mb.i_b4_stride; } else { h->mb.left_index_table = &left_indices[1]; topleft_y++; h->mb.topleft_partition = 1; } } if( !mb_interlaced ) topright_y = -1; } 
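// mb_y is even: this is the top macroblock of the pair, so the row above belongs
// to the previous pair and may be coded in the other field/frame mode; the code
// below picks which of its two macroblocks serves as the top/topleft/topright neighbour.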
else { if( mb_interlaced && top >= 0 ) { if( !h->mb.field[top] ) { top += h->mb.i_mb_stride; top_y++; } if( mb_x ) topleft_y += !h->mb.field[h->mb.i_mb_stride*topleft_y + mb_x - 1]; if( mb_x < h->mb.i_mb_width-1 ) topright_y += !h->mb.field[h->mb.i_mb_stride*topright_y + mb_x + 1]; } if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] ) { if( mb_interlaced ) { h->mb.left_index_table = &left_indices[2]; left[1] += h->mb.i_mb_stride; h->mb.left_b8[1] += 2*h->mb.i_b8_stride; h->mb.left_b4[1] += 4*h->mb.i_b4_stride; } else h->mb.left_index_table = &left_indices[0]; } } } if( mb_x > 0 ) { h->mb.i_neighbour_frame |= MB_LEFT; h->mb.i_mb_left_xy[0] = left[0]; h->mb.i_mb_left_xy[1] = left[1]; h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]]; h->mb.i_mb_type_left[1] = h->mb.type[h->mb.i_mb_left_xy[1]]; if( h->mb.slice_table[left[0]] == h->sh.i_first_mb ) { h->mb.i_neighbour |= MB_LEFT; // FIXME: We don't currently support constrained intra + mbaff. if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) ) h->mb.i_neighbour_intra |= MB_LEFT; } } /* We can't predict from the previous threadslice since it hasn't been encoded yet. */ if( (h->i_threadslice_start >> mb_interlaced) != (mb_y >> mb_interlaced) ) { if( top >= 0 ) { h->mb.i_neighbour_frame |= MB_TOP; h->mb.i_mb_top_xy = top; h->mb.i_mb_top_y = top_y; h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy]; if( h->mb.slice_table[top] == h->sh.i_first_mb ) { h->mb.i_neighbour |= MB_TOP; if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) ) h->mb.i_neighbour_intra |= MB_TOP; /* We only need to prefetch the top blocks because the left was just written * to as part of the previous cache_save. Since most target CPUs use write-allocate * caches, left blocks are near-guaranteed to be in L1 cache. Top--not so much. 
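 * (The prefetches that follow cover the top macroblock's cbp, intra4x4 modes,
 * non_zero_count, transform size and, for CABAC, skipbp.)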
*/ x264_prefetch( &h->mb.cbp[top] ); x264_prefetch( h->mb.intra4x4_pred_mode[top] ); x264_prefetch( &h->mb.non_zero_count[top][12] ); x264_prefetch( &h->mb.mb_transform_size[top] ); if( h->param.b_cabac ) x264_prefetch( &h->mb.skipbp[top] ); } } if( mb_x > 0 && topleft_y >= 0 ) { h->mb.i_neighbour_frame |= MB_TOPLEFT; h->mb.i_mb_topleft_xy = h->mb.i_mb_stride*topleft_y + mb_x - 1; h->mb.i_mb_topleft_y = topleft_y; h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topleft_xy]; if( h->mb.slice_table[h->mb.i_mb_topleft_xy] == h->sh.i_first_mb ) { h->mb.i_neighbour |= MB_TOPLEFT; if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) ) h->mb.i_neighbour_intra |= MB_TOPLEFT; } } if( mb_x < h->mb.i_mb_width - 1 && topright_y >= 0 ) { h->mb.i_neighbour_frame |= MB_TOPRIGHT; h->mb.i_mb_topright_xy = h->mb.i_mb_stride*topright_y + mb_x + 1; h->mb.i_mb_topright_y = topright_y; h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topright_xy]; if( h->mb.slice_table[h->mb.i_mb_topright_xy] == h->sh.i_first_mb ) { h->mb.i_neighbour |= MB_TOPRIGHT; if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) ) h->mb.i_neighbour_intra |= MB_TOPRIGHT; } } } } #define LTOP 0 #if HAVE_INTERLACED # define LBOT 1 #else # define LBOT 0 #endif static ALWAYS_INLINE void macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_mbaff ) { macroblock_cache_load_neighbours( h, mb_x, mb_y, b_mbaff ); int *left = h->mb.i_mb_left_xy; int top = h->mb.i_mb_top_xy; int top_y = h->mb.i_mb_top_y; int s8x8 = h->mb.i_b8_stride; int s4x4 = h->mb.i_b4_stride; int top_8x8 = (2*top_y+1) * s8x8 + 2*mb_x; int top_4x4 = (4*top_y+3) * s4x4 + 4*mb_x; int lists = (1 << h->sh.i_type) & 3; /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */ /* By only dereferencing them once, we avoid this issue. */ int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode; uint8_t (*nnz)[48] = h->mb.non_zero_count; int16_t *cbp = h->mb.cbp; const x264_left_table_t *left_index_table = h->mb.left_index_table; h->mb.cache.deblock_strength = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?h->mb.i_mb_xy:mb_x]; /* load cache */ if( h->mb.i_neighbour & MB_TOP ) { h->mb.cache.i_cbp_top = cbp[top]; /* load intra4x4 */ CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] ); /* load non_zero_count */ CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] ); CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>CHROMA_V_SHIFT)] ); CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] ); /* Finish the prefetching */ for( int l = 0; l < lists; l++ ) { x264_prefetch( &h->mb.mv[l][top_4x4-1] ); /* Top right being not in the same cacheline as top left will happen * once every 4 MBs, so one extra prefetch is worthwhile */ x264_prefetch( &h->mb.mv[l][top_4x4+4] ); x264_prefetch( &h->mb.ref[l][top_8x8-1] ); if( h->param.b_cabac ) x264_prefetch( &h->mb.mvd[l][top] ); } } else { h->mb.cache.i_cbp_top = -1; /* load intra4x4 */ M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU; /* load non_zero_count */ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U; M32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8] ) = 0x80808080U; M32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8] ) = 0x80808080U; } if( h->mb.i_neighbour & MB_LEFT ) { int ltop = left[LTOP]; int lbot = b_mbaff ? 
left[LBOT] : ltop; if( b_mbaff ) { const int16_t top_luma = (cbp[ltop] >> (left_index_table->mv[0]&(~1))) & 2; const int16_t bot_luma = (cbp[lbot] >> (left_index_table->mv[2]&(~1))) & 2; h->mb.cache.i_cbp_left = (cbp[ltop] & 0xfff0) | (bot_luma<<2) | top_luma; } else h->mb.cache.i_cbp_left = cbp[ltop]; /* load intra4x4 */ h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = i4x4[ltop][left_index_table->intra[0]]; h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = i4x4[ltop][left_index_table->intra[1]]; h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = i4x4[lbot][left_index_table->intra[2]]; h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[lbot][left_index_table->intra[3]]; /* load non_zero_count */ h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = nnz[ltop][left_index_table->nnz[0]]; h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = nnz[ltop][left_index_table->nnz[1]]; h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]]; h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]]; if( CHROMA_FORMAT >= CHROMA_422 ) { int offset = (4>>CHROMA_H_SHIFT) - 4; h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset]; h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset]; h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset]; h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset]; h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset]; h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset]; h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset]; h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset]; } else { h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[0]]; h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[1]]; h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[2]]; h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[3]]; } } else { h->mb.cache.i_cbp_left = -1; h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1; /* load non_zero_count */ h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = h->mb.cache.non_zero_count[x264_scan8[10] - 1] = h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80; if( CHROMA_FORMAT >= CHROMA_422 ) { h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = 0x80; } } if( h->pps->b_transform_8x8_mode ) { h->mb.cache.i_neighbour_transform_size = ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left[0]] ) + ( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] ); } if( b_mbaff ) { h->mb.pic.i_fref[0] = h->i_ref[0] << 
MB_INTERLACED; h->mb.pic.i_fref[1] = h->i_ref[1] << MB_INTERLACED; } if( !b_mbaff ) { x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE ); macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 0 ); if( CHROMA444 ) { x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+ 4*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+12*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+ 4*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+12*FDEC_STRIDE ); macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 0 ); macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 0 ); } else if( CHROMA_FORMAT ) { x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); if( CHROMA_FORMAT == CHROMA_422 ) { x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE ); } macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 ); } } else { macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 1 ); if( CHROMA444 ) { macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 1 ); macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 1 ); } else if( CHROMA_FORMAT ) macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 1 ); } if( h->fdec->integral ) { int offset = 16 * (mb_x + mb_y * h->fdec->i_stride[0]); for( int list = 0; list < 2; list++ ) for( int i = 0; i < h->mb.pic.i_fref[list]; i++ ) h->mb.pic.p_integral[list][i] = &h->fref[list][i]->integral[offset]; } x264_prefetch_fenc( h, h->fenc, mb_x, mb_y ); /* load ref/mv/mvd */ for( int l = 0; l < lists; l++ ) { int16_t (*mv)[2] = h->mb.mv[l]; int8_t *ref = h->mb.ref[l]; int i8 = x264_scan8[0] - 1 - 1*8; if( h->mb.i_neighbour & MB_TOPLEFT ) { int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topleft_y + mb_x-1)+1+s8x8 : top_8x8 - 1; int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topleft_y + mb_x-1)+3+3*s4x4 : top_4x4 - 1; if( b_mbaff && h->mb.topleft_partition ) { /* Take motion vector from the middle of macroblock instead of * the bottom right as usual. */ iv -= 2*s4x4; ir -= s8x8; } h->mb.cache.ref[l][i8] = ref[ir]; CP32( h->mb.cache.mv[l][i8], mv[iv] ); } else { h->mb.cache.ref[l][i8] = -2; M32( h->mb.cache.mv[l][i8] ) = 0; } i8 = x264_scan8[0] - 8; if( h->mb.i_neighbour & MB_TOP ) { h->mb.cache.ref[l][i8+0] = h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0]; h->mb.cache.ref[l][i8+2] = h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1]; CP128( h->mb.cache.mv[l][i8], mv[top_4x4] ); } else { M128( h->mb.cache.mv[l][i8] ) = M128_ZERO; M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U; } i8 = x264_scan8[0] + 4 - 1*8; if( h->mb.i_neighbour & MB_TOPRIGHT ) { int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topright_y + (mb_x+1))+s8x8 : top_8x8 + 2; int iv = b_mbaff ? 
4*(s4x4*h->mb.i_mb_topright_y + (mb_x+1))+3*s4x4 : top_4x4 + 4; h->mb.cache.ref[l][i8] = ref[ir]; CP32( h->mb.cache.mv[l][i8], mv[iv] ); } else h->mb.cache.ref[l][i8] = -2; i8 = x264_scan8[0] - 1; if( h->mb.i_neighbour & MB_LEFT ) { if( b_mbaff ) { h->mb.cache.ref[l][i8+0*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[0]]; h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[1]]; h->mb.cache.ref[l][i8+2*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[2]]; h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[3]]; CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[0]] ); CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[1]] ); CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[2]] ); CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[3]] ); } else { const int ir = h->mb.i_b8_xy - 1; const int iv = h->mb.i_b4_xy - 1; h->mb.cache.ref[l][i8+0*8] = h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8]; h->mb.cache.ref[l][i8+2*8] = h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8]; CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] ); CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] ); CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] ); CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] ); } } else { for( int i = 0; i < 4; i++ ) { h->mb.cache.ref[l][i8+i*8] = -2; M32( h->mb.cache.mv[l][i8+i*8] ) = 0; } } /* Extra logic for top right mv in mbaff. * . . . d . . a . * . . . e . . . . * . . . f b . c . * . . . . . . . . * * If the top right of the 4x4 partitions labeled a, b and c in the * above diagram do not exist, but the entries d, e and f exist (in * the macroblock to the left) then use those instead. */ if( b_mbaff && (h->mb.i_neighbour & MB_LEFT) ) { if( MB_INTERLACED && !h->mb.field[h->mb.i_mb_xy-1] ) { h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*0]; h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*1]; h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[1] + 1 + s8x8*0]; CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[0]+1)] ); CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[1]+1)] ); CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[1] + 3 + s4x4*(left_index_table->mv[2]+1)] ); } else if( !MB_INTERLACED && h->mb.field[h->mb.i_mb_xy-1] ) { // Looking at the bottom field so always take the bottom macroblock of the pair. 
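// A macroblock spans two rows of the 8x8 block grid and four rows of the 4x4 grid,
// so the +s8x8*2 and +s4x4*4 offsets step down to that bottom macroblock.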
h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]]; h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]]; CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] ); CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] ); CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[2]] ); } } if( h->param.b_cabac ) { uint8_t (*mvd)[8][2] = h->mb.mvd[l]; if( h->mb.i_neighbour & MB_TOP ) CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] ); else M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0; if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1] >= 0) ) { CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[LTOP]][left_index_table->intra[0]] ); CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[LTOP]][left_index_table->intra[1]] ); } else { M16( h->mb.cache.mvd[l][x264_scan8[0]-1+0*8] ) = 0; M16( h->mb.cache.mvd[l][x264_scan8[0]-1+1*8] ) = 0; } if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1+2*8] >= 0) ) { CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[LBOT]][left_index_table->intra[2]] ); CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[LBOT]][left_index_table->intra[3]] ); } else { M16( h->mb.cache.mvd[l][x264_scan8[0]-1+2*8] ) = 0; M16( h->mb.cache.mvd[l][x264_scan8[0]-1+3*8] ) = 0; } } /* If motion vectors are cached from frame macroblocks but this * macroblock is a field macroblock then the motion vector must be * halved. Similarly, motion vectors from field macroblocks are doubled. */ if( b_mbaff ) { #define MAP_MVS\ if( FIELD_DIFFERENT(h->mb.i_mb_topleft_xy) )\ MAP_F2F(mv, ref, x264_scan8[0] - 1 - 1*8)\ if( FIELD_DIFFERENT(top) )\ {\ MAP_F2F(mv, ref, x264_scan8[0] + 0 - 1*8)\ MAP_F2F(mv, ref, x264_scan8[0] + 1 - 1*8)\ MAP_F2F(mv, ref, x264_scan8[0] + 2 - 1*8)\ MAP_F2F(mv, ref, x264_scan8[0] + 3 - 1*8)\ }\ if( FIELD_DIFFERENT(h->mb.i_mb_topright_xy) )\ MAP_F2F(mv, ref, x264_scan8[0] + 4 - 1*8)\ if( FIELD_DIFFERENT(left[0]) )\ {\ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 0*8)\ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 1*8)\ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 2*8)\ MAP_F2F(mv, ref, x264_scan8[0] - 1 + 3*8)\ MAP_F2F(topright_mv, topright_ref, 0)\ MAP_F2F(topright_mv, topright_ref, 1)\ MAP_F2F(topright_mv, topright_ref, 2)\ } if( MB_INTERLACED ) { #define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && !h->mb.field[macroblock]) #define MAP_F2F(varmv, varref, index)\ if( h->mb.cache.varref[l][index] >= 0 )\ {\ h->mb.cache.varref[l][index] <<= 1;\ h->mb.cache.varmv[l][index][1] /= 2;\ h->mb.cache.mvd[l][index][1] >>= 1;\ } MAP_MVS #undef MAP_F2F #undef FIELD_DIFFERENT } else { #define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && h->mb.field[macroblock]) #define MAP_F2F(varmv, varref, index)\ if( h->mb.cache.varref[l][index] >= 0 )\ {\ h->mb.cache.varref[l][index] >>= 1;\ h->mb.cache.varmv[l][index][1] *= 2;\ h->mb.cache.mvd[l][index][1] <<= 1;\ } MAP_MVS #undef MAP_F2F #undef FIELD_DIFFERENT } } } if( b_mbaff && mb_x == 0 && !(mb_y&1) ) { if( h->mb.i_mb_top_xy >= h->sh.i_first_mb ) h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_top_xy]; else h->mb.field_decoding_flag = 0; } /* Check whether skip here would cause decoder to predict interlace mode incorrectly. 
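 * (For a skipped MBAFF pair the decoder infers mb_field_decoding_flag from the
 * neighbouring pairs, so allowing a skip here could leave that inference out of
 * sync with the field mode actually chosen for this pair.)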
* FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */ h->mb.b_allow_skip = 1; if( b_mbaff ) { if( MB_INTERLACED != h->mb.field_decoding_flag && (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) ) h->mb.b_allow_skip = 0; } if( h->param.b_cabac ) { if( b_mbaff ) { int left_xy, top_xy; /* Neighbours here are calculated based on field_decoding_flag */ int mb_xy = mb_x + (mb_y&~1)*h->mb.i_mb_stride; left_xy = mb_xy - 1; if( (mb_y&1) && mb_x > 0 && h->mb.field_decoding_flag == h->mb.field[left_xy] ) left_xy += h->mb.i_mb_stride; if( h->mb.field_decoding_flag ) { top_xy = mb_xy - h->mb.i_mb_stride; if( !(mb_y&1) && top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && h->mb.field[top_xy] ) top_xy -= h->mb.i_mb_stride; } else top_xy = mb_x + (mb_y-1)*h->mb.i_mb_stride; h->mb.cache.i_neighbour_skip = (mb_x > 0 && h->mb.slice_table[left_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[left_xy] )) + (top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[top_xy] )); } else { h->mb.cache.i_neighbour_skip = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] )) + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top )); } } /* load skip */ if( h->sh.i_type == SLICE_TYPE_B ) { h->mb.bipred_weight = h->mb.bipred_weight_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)]; h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)]; if( h->param.b_cabac ) { uint8_t skipbp; x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 ); if( b_mbaff ) { skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LTOP]] : 0; h->mb.cache.skip[x264_scan8[0] - 1] = (skipbp >> (1+(left_index_table->mv[0]&~1))) & 1; skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LBOT]] : 0; h->mb.cache.skip[x264_scan8[8] - 1] = (skipbp >> (1+(left_index_table->mv[2]&~1))) & 1; } else { skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0; h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2; h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8; } skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0; h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4; h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8; } } if( h->sh.i_type == SLICE_TYPE_P ) x264_mb_predict_mv_pskip( h, h->mb.cache.pskip_mv ); h->mb.i_neighbour4[0] = h->mb.i_neighbour8[0] = (h->mb.i_neighbour_intra & (MB_TOP|MB_LEFT|MB_TOPLEFT)) | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOPRIGHT : 0); h->mb.i_neighbour4[4] = h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour_intra & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0); h->mb.i_neighbour4[2] = h->mb.i_neighbour4[8] = h->mb.i_neighbour4[10] = h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour_intra & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0); h->mb.i_neighbour4[5] = h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour_intra & MB_TOPRIGHT) | ((h->mb.i_neighbour_intra & MB_TOP) ? 
MB_TOP|MB_TOPLEFT : 0); } void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y ) { macroblock_cache_load( h, mb_x, mb_y, 0 ); } void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y ) { macroblock_cache_load( h, mb_x, mb_y, 1 ); } static void macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][4] ) { if( (h->mb.i_neighbour & MB_LEFT) && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED ) { static const uint8_t offset[2][2][8] = { { { 0, 0, 0, 0, 1, 1, 1, 1 }, { 2, 2, 2, 2, 3, 3, 3, 3 }, }, { { 0, 1, 2, 3, 0, 1, 2, 3 }, { 0, 1, 2, 3, 0, 1, 2, 3 }, } }; ALIGNED_ARRAY_8( uint8_t, tmpbs, [8] ); const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1]; uint8_t (*nnz)[48] = h->mb.non_zero_count; for( int i = 0; i < 8; i++ ) { int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1]; int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)]; int nnz_left = nnz[left][3 + 4*off[i]]; if( !h->param.b_cabac && h->pps->b_transform_8x8_mode ) { int j = off[i]&~1; if( h->mb.mb_transform_size[left] ) nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] )); } tmpbs[i] = (nnz_left || nnz_this) ? 2 : 1; } if( MB_INTERLACED ) { CP32( bs[0][0], &tmpbs[0] ); CP32( bs[0][4], &tmpbs[4] ); } else { for( int i = 0; i < 4; i++ ) bs[0][0][i] = tmpbs[2*i]; for( int i = 0; i < 4; i++ ) bs[0][4][i] = tmpbs[1+2*i]; } } if( (h->mb.i_neighbour & MB_TOP) && MB_INTERLACED != h->mb.field[h->mb.i_mb_top_xy] ) { if( !(h->mb.i_mb_y&1) && !MB_INTERLACED ) { /* Need to filter both fields (even for frame macroblocks). * Filter top two rows using the top macroblock of the above * pair and then the bottom one. */ int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride; uint8_t *nnz_cur = &h->mb.cache.non_zero_count[x264_scan8[0]]; for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride ) { uint8_t (*nnz)[48] = h->mb.non_zero_count; ALIGNED_4( uint8_t nnz_top[4] ); CP32( nnz_top, &nnz[mbn_xy][3*4] ); if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && h->mb.mb_transform_size[mbn_xy] ) { nnz_top[0] = nnz_top[1] = M16( &nnz[mbn_xy][ 8] ) || M16( &nnz[mbn_xy][12] ); nnz_top[2] = nnz_top[3] = M16( &nnz[mbn_xy][10] ) || M16( &nnz[mbn_xy][14] ); } for( int i = 0; i < 4; i++ ) bs[1][4*j][i] = (nnz_cur[i] || nnz_top[i]) ? 
2 : 1; } } else for( int i = 0; i < 4; i++ ) bs[1][0][i] = X264_MAX( bs[1][0][i], 1 ); } } void x264_macroblock_deblock_strength( x264_t *h ) { uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( IS_INTRA( h->mb.i_type ) ) { M32( bs[0][1] ) = 0x03030303; M64( bs[0][2] ) = 0x0303030303030303ULL; M32( bs[1][1] ) = 0x03030303; M64( bs[1][2] ) = 0x0303030303030303ULL; return; } /* Early termination: in this case, nnz guarantees all edges use strength 2.*/ if( h->mb.b_transform_8x8 && !CHROMA444 ) { int cbp_mask = 0xf >> CHROMA_V_SHIFT; if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask ) { M32( bs[0][0] ) = 0x02020202; M32( bs[0][2] ) = 0x02020202; M32( bs[0][4] ) = 0x02020202; M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */ M64( bs[1][2] ) = 0x0202020202020202ULL; M32( bs[1][4] ) = 0x02020202; return; } } int neighbour_changed = 0; if( h->sh.i_disable_deblocking_filter_idc != 2 ) { neighbour_changed = h->mb.i_neighbour_frame&~h->mb.i_neighbour; h->mb.i_neighbour = h->mb.i_neighbour_frame; } /* MBAFF deblock uses different left neighbors from encoding */ if( SLICE_MBAFF && (h->mb.i_neighbour & MB_LEFT) && (h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED) ) { h->mb.i_mb_left_xy[1] = h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1; if( h->mb.i_mb_y&1 ) h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride; else h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride; } /* If we have multiple slices and we're deblocking on slice edges, we * have to reload neighbour data. */ if( neighbour_changed ) { int top_y = h->mb.i_mb_top_y; int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*h->mb.i_mb_x; int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*h->mb.i_mb_x; int s8x8 = h->mb.i_b8_stride; int s4x4 = h->mb.i_b4_stride; uint8_t (*nnz)[48] = h->mb.non_zero_count; const x264_left_table_t *left_index_table = SLICE_MBAFF ? 
h->mb.left_index_table : &left_indices[3]; if( neighbour_changed & MB_TOP ) CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] ); if( neighbour_changed & MB_LEFT ) { int *left = h->mb.i_mb_left_xy; h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]]; h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]]; h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]]; h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]]; } for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) { int16_t (*mv)[2] = h->mb.mv[l]; int8_t *ref = h->mb.ref[l]; int i8 = x264_scan8[0] - 8; if( neighbour_changed & MB_TOP ) { h->mb.cache.ref[l][i8+0] = h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0]; h->mb.cache.ref[l][i8+2] = h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1]; CP128( h->mb.cache.mv[l][i8], mv[top_4x4] ); } i8 = x264_scan8[0] - 1; if( neighbour_changed & MB_LEFT ) { h->mb.cache.ref[l][i8+0*8] = h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]]; h->mb.cache.ref[l][i8+2*8] = h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]]; CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] ); CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] ); CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] ); CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] ); } } } if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P ) { /* Handle reference frame duplicates */ int i8 = x264_scan8[0] - 8; h->mb.cache.ref[0][i8+0] = h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]); h->mb.cache.ref[0][i8+2] = h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]); i8 = x264_scan8[0] - 1; h->mb.cache.ref[0][i8+0*8] = h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]); h->mb.cache.ref[0][i8+2*8] = h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]); int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]); int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]); int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]); int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]); uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101; uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101; M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop; M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop; M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot; M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot; } /* Munge NNZ for cavlc + 8x8dct */ if( !h->param.b_cabac && h->pps->b_transform_8x8_mode ) { uint8_t (*nnz)[48] = h->mb.non_zero_count; int top = h->mb.i_mb_top_xy; int *left = h->mb.i_mb_left_xy; if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] ) { int i8 = x264_scan8[0] - 8; int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] ); int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] ); M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0; M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 
0x0101 : 0; } if( h->mb.i_neighbour & MB_LEFT ) { int i8 = x264_scan8[0] - 1; if( h->mb.mb_transform_size[left[0]] ) { int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] ); h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0; h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0; } if( h->mb.mb_transform_size[left[1]] ) { int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] ); h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1; h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1; } } if( h->mb.b_transform_8x8 ) { int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] ); int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] ); int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] ); uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101; uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101; M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop; M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop; M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot; M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot; } } h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B ); if( SLICE_MBAFF ) macroblock_deblock_strength_mbaff( h, bs ); } static ALWAYS_INLINE void macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) { int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16; int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED); int i_pix_offset = (b_mbaff && MB_INTERLACED) ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride : 16 * mb_x + height * mb_y * i_stride; if( b_chroma ) h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height ); else h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 ); } static ALWAYS_INLINE void macroblock_backup_intra( x264_t *h, int mb_x, int mb_y, int b_mbaff ) { /* In MBAFF we store the last two rows in intra_border_backup[0] and [1]. * For progressive mbs this is the bottom two rows, and for interlaced the * bottom row of each field. We also store samples needed for the next * mbpair in intra_border_backup[2]. */ int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2; memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL ); if( CHROMA444 ) { memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL ); memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL ); } else if( CHROMA_FORMAT ) { int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE; memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL ); memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL ); } if( b_mbaff ) { if( mb_y&1 ) { int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE; backup_dst = MB_INTERLACED ? 
2 : 0; memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL ); if( CHROMA444 ) { memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL ); memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL ); } else if( CHROMA_FORMAT ) { if( CHROMA_FORMAT == CHROMA_420 ) backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE; memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL ); memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL ); } } } } void x264_macroblock_cache_save( x264_t *h ) { const int i_mb_xy = h->mb.i_mb_xy; const int i_mb_type = x264_mb_type_fix[h->mb.i_type]; const int s8x8 = h->mb.i_b8_stride; const int s4x4 = h->mb.i_b4_stride; const int i_mb_4x4 = h->mb.i_b4_xy; const int i_mb_8x8 = h->mb.i_b8_xy; /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing. */ /* By only dereferencing them once, we avoid this issue. */ int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy]; uint8_t *nnz = h->mb.non_zero_count[i_mb_xy]; if( SLICE_MBAFF ) { macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 ); macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 1 ); if( CHROMA444 ) { macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 1 ); macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 1 ); } else if( CHROMA_FORMAT ) macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 1 ); } else { macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 ); macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 0 ); if( CHROMA444 ) { macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 0 ); macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 0 ); } else if( CHROMA_FORMAT ) macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 0 ); } x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); h->mb.type[i_mb_xy] = i_mb_type; h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb; h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition; h->mb.i_mb_prev_xy = i_mb_xy; /* save intra4x4 */ if( i_mb_type == I_4x4 ) { CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] ); M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ], h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ], h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0); } else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) ) M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL; else M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL; if( i_mb_type == I_PCM ) { h->mb.qp[i_mb_xy] = 0; h->mb.i_last_dqp = 0; h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2; h->mb.i_cbp_luma = 0xf; h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700; h->mb.b_transform_8x8 = 0; for( int i = 0; i < 48; i++ ) h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 
1 : 16; } else { if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 ) h->mb.i_qp = h->mb.i_last_qp; h->mb.qp[i_mb_xy] = h->mb.i_qp; h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp; h->mb.i_last_qp = h->mb.i_qp; } /* save non zero count */ CP32( &nnz[ 0+0*4], &h->mb.cache.non_zero_count[x264_scan8[ 0]] ); CP32( &nnz[ 0+1*4], &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); CP32( &nnz[ 0+2*4], &h->mb.cache.non_zero_count[x264_scan8[ 8]] ); CP32( &nnz[ 0+3*4], &h->mb.cache.non_zero_count[x264_scan8[10]] ); CP32( &nnz[16+0*4], &h->mb.cache.non_zero_count[x264_scan8[16+0]] ); CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] ); CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] ); CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] ); if( CHROMA_FORMAT >= CHROMA_422 ) { CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ); CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] ); CP32( &nnz[32+2*4], &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ); CP32( &nnz[32+3*4], &h->mb.cache.non_zero_count[x264_scan8[32+10]] ); } if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 ) h->mb.b_transform_8x8 = 0; h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8; if( h->sh.i_type != SLICE_TYPE_I ) { int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4]; int8_t *ref0 = &h->mb.ref[0][i_mb_8x8]; if( !IS_INTRA( i_mb_type ) ) { ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]]; ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]]; ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]]; ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]]; CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] ); CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] ); CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] ); CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] ); if( h->sh.i_type == SLICE_TYPE_B ) { int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4]; int8_t *ref1 = &h->mb.ref[1][i_mb_8x8]; ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]]; ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]]; ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]]; ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]]; CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] ); CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] ); CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] ); CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] ); } } else { M16( &ref0[0*s8x8] ) = (uint8_t)(-1) * 0x0101; M16( &ref0[1*s8x8] ) = (uint8_t)(-1) * 0x0101; M128( &mv0[0*s4x4] ) = M128_ZERO; M128( &mv0[1*s4x4] ) = M128_ZERO; M128( &mv0[2*s4x4] ) = M128_ZERO; M128( &mv0[3*s4x4] ) = M128_ZERO; if( h->sh.i_type == SLICE_TYPE_B ) { int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4]; int8_t *ref1 = &h->mb.ref[1][i_mb_8x8]; M16( &ref1[0*s8x8] ) = (uint8_t)(-1) * 0x0101; M16( &ref1[1*s8x8] ) = (uint8_t)(-1) * 0x0101; M128( &mv1[0*s4x4] ) = M128_ZERO; M128( &mv1[1*s4x4] ) = M128_ZERO; M128( &mv1[2*s4x4] ) = M128_ZERO; M128( &mv1[3*s4x4] ) = M128_ZERO; } } } if( h->param.b_cabac ) { uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy]; if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM ) h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; else h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC; if( (0x3FF30 >> i_mb_type) & 1 ) /* !INTRA && !SKIP && !DIRECT */ { CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] ); CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] ); CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] ); CP16( mvd0[6], 
h->mb.cache.mvd[0][x264_scan8[13]] ); if( h->sh.i_type == SLICE_TYPE_B ) { uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy]; CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] ); CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] ); CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] ); CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] ); } } else { M128( mvd0[0] ) = M128_ZERO; if( h->sh.i_type == SLICE_TYPE_B ) { uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy]; M128( mvd1[0] ) = M128_ZERO; } } if( h->sh.i_type == SLICE_TYPE_B ) { if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT ) h->mb.skipbp[i_mb_xy] = 0xf; else if( i_mb_type == B_8x8 ) { int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0; skipbp |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1; skipbp |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2; skipbp |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3; h->mb.skipbp[i_mb_xy] = skipbp; } else h->mb.skipbp[i_mb_xy] = 0; } } } void x264_macroblock_bipred_init( x264_t *h ) { for( int mbfield = 0; mbfield <= SLICE_MBAFF; mbfield++ ) for( int field = 0; field <= SLICE_MBAFF; field++ ) for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<mbfield); i_ref0++ ) { x264_frame_t *l0 = h->fref[0][i_ref0>>mbfield]; int poc0 = l0->i_poc + mbfield*l0->i_delta_poc[field^(i_ref0&1)]; for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<mbfield); i_ref1++ ) { x264_frame_t *l1 = h->fref[1][i_ref1>>mbfield]; int cur_poc = h->fdec->i_poc + mbfield*h->fdec->i_delta_poc[field]; int poc1 = l1->i_poc + mbfield*l1->i_delta_poc[field^(i_ref1&1)]; int td = x264_clip3( poc1 - poc0, -128, 127 ); if( td == 0 /* || pic0 is a long-term ref */ ) { h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = 256; h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32; } else { int tb = x264_clip3( cur_poc - poc0, -128, 127 ); int tx = (16384 + (abs(td) >> 1)) / td; int dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 ); h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = dist_scale_factor; dist_scale_factor >>= 2; if( h->param.analyse.b_weighted_bipred /* && pic1 is not a long-term ref */ && dist_scale_factor >= -64 && dist_scale_factor <= 128 ) { h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 64 - dist_scale_factor; // ssse3 implementation of biweight doesn't support the extrema. // if we ever generate them, we'll have to drop that optimization. assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 ); } else h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32; } } } } x264-master/common/macroblock.h000066400000000000000000000377071502133446700166500ustar00rootroot00000000000000/***************************************************************************** * macroblock.h: macroblock common functions ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
* * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_MACROBLOCK_H #define X264_MACROBLOCK_H enum macroblock_position_e { MB_LEFT = 0x01, MB_TOP = 0x02, MB_TOPRIGHT = 0x04, MB_TOPLEFT = 0x08, MB_PRIVATE = 0x10, ALL_NEIGHBORS = 0xf, }; static const uint8_t x264_pred_i4x4_neighbors[12] = { MB_TOP, // I_PRED_4x4_V MB_LEFT, // I_PRED_4x4_H MB_LEFT | MB_TOP, // I_PRED_4x4_DC MB_TOP | MB_TOPRIGHT, // I_PRED_4x4_DDL MB_LEFT | MB_TOPLEFT | MB_TOP, // I_PRED_4x4_DDR MB_LEFT | MB_TOPLEFT | MB_TOP, // I_PRED_4x4_VR MB_LEFT | MB_TOPLEFT | MB_TOP, // I_PRED_4x4_HD MB_TOP | MB_TOPRIGHT, // I_PRED_4x4_VL MB_LEFT, // I_PRED_4x4_HU MB_LEFT, // I_PRED_4x4_DC_LEFT MB_TOP, // I_PRED_4x4_DC_TOP 0 // I_PRED_4x4_DC_128 }; /* XXX mb_type isn't the one written in the bitstream -> only internal usage */ #define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 || (type) == I_PCM ) #define IS_SKIP(type) ( (type) == P_SKIP || (type) == B_SKIP ) #define IS_DIRECT(type) ( (type) == B_DIRECT ) enum mb_class_e { I_4x4 = 0, I_8x8 = 1, I_16x16 = 2, I_PCM = 3, P_L0 = 4, P_8x8 = 5, P_SKIP = 6, B_DIRECT = 7, B_L0_L0 = 8, B_L0_L1 = 9, B_L0_BI = 10, B_L1_L0 = 11, B_L1_L1 = 12, B_L1_BI = 13, B_BI_L0 = 14, B_BI_L1 = 15, B_BI_BI = 16, B_8x8 = 17, B_SKIP = 18, X264_MBTYPE_MAX = 19 }; static const uint8_t x264_mb_type_fix[X264_MBTYPE_MAX] = { I_4x4, I_4x4, I_16x16, I_PCM, P_L0, P_8x8, P_SKIP, B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1, B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP }; static const uint8_t x264_mb_type_list_table[X264_MBTYPE_MAX][2][2] = { {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, {{0,0},{0,0}}, /* INTRA */ {{1,1},{0,0}}, /* P_L0 */ {{0,0},{0,0}}, /* P_8x8 */ {{1,1},{0,0}}, /* P_SKIP */ {{0,0},{0,0}}, /* B_DIRECT */ {{1,1},{0,0}}, {{1,0},{0,1}}, {{1,1},{0,1}}, /* B_L0_* */ {{0,1},{1,0}}, {{0,0},{1,1}}, {{0,1},{1,1}}, /* B_L1_* */ {{1,1},{1,0}}, {{1,0},{1,1}}, {{1,1},{1,1}}, /* B_BI_* */ {{0,0},{0,0}}, /* B_8x8 */ {{0,0},{0,0}} /* B_SKIP */ }; #define IS_SUB4x4(type) ( (type == D_L0_4x4)||(type == D_L1_4x4)||(type == D_BI_4x4) ) #define IS_SUB4x8(type) ( (type == D_L0_4x8)||(type == D_L1_4x8)||(type == D_BI_4x8) ) #define IS_SUB8x4(type) ( (type == D_L0_8x4)||(type == D_L1_8x4)||(type == D_BI_8x4) ) #define IS_SUB8x8(type) ( (type == D_L0_8x8)||(type == D_L1_8x8)||(type == D_BI_8x8)||(type == D_DIRECT_8x8) ) enum mb_partition_e { /* sub partition type for P_8x8 and B_8x8 */ D_L0_4x4 = 0, D_L0_8x4 = 1, D_L0_4x8 = 2, D_L0_8x8 = 3, /* sub partition type for B_8x8 only */ D_L1_4x4 = 4, D_L1_8x4 = 5, D_L1_4x8 = 6, D_L1_8x8 = 7, D_BI_4x4 = 8, D_BI_8x4 = 9, D_BI_4x8 = 10, D_BI_8x8 = 11, D_DIRECT_8x8 = 12, /* partition */ D_8x8 = 13, D_16x8 = 14, D_8x16 = 15, D_16x16 = 16, X264_PARTTYPE_MAX = 17, }; static const uint8_t x264_mb_partition_listX_table[2][17] = {{ 1, 1, 1, 1, /* D_L0_* */ 0, 0, 0, 0, /* D_L1_* */ 1, 1, 1, 1, /* D_BI_* */ 0, /* D_DIRECT_8x8 */ 0, 0, 0, 0 /* 8x8 .. 16x16 */ }, { 0, 0, 0, 0, /* D_L0_* */ 1, 1, 1, 1, /* D_L1_* */ 1, 1, 1, 1, /* D_BI_* */ 0, /* D_DIRECT_8x8 */ 0, 0, 0, 0 /* 8x8 .. 
16x16 */ }}; static const uint8_t x264_mb_partition_count_table[17] = { /* sub L0 */ 4, 2, 2, 1, /* sub L1 */ 4, 2, 2, 1, /* sub BI */ 4, 2, 2, 1, /* Direct */ 1, /* Partition */ 4, 2, 2, 1 }; static const uint8_t x264_mb_partition_pixel_table[17] = { PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8, /* D_L0_* */ PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8, /* D_L1_* */ PIXEL_4x4, PIXEL_8x4, PIXEL_4x8, PIXEL_8x8, /* D_BI_* */ PIXEL_8x8, /* D_DIRECT_8x8 */ PIXEL_8x8, PIXEL_16x8, PIXEL_8x16, PIXEL_16x16, /* 8x8 .. 16x16 */ }; /* zigzags are transposed with respect to the tables in the standard */ static const uint8_t x264_zigzag_scan4[2][16] = {{ // frame 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 }, { // field 0, 1, 4, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }}; static const uint8_t x264_zigzag_scan8[2][64] = {{ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63 }, { 0, 1, 2, 8, 9, 3, 4, 10, 16, 11, 5, 6, 7, 12, 17, 24, 18, 13, 14, 15, 19, 25, 32, 26, 20, 21, 22, 23, 27, 33, 40, 34, 28, 29, 30, 31, 35, 41, 48, 42, 36, 37, 38, 39, 43, 49, 50, 44, 45, 46, 47, 51, 56, 57, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63 }}; static const uint8_t block_idx_x[16] = { 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3 }; static const uint8_t block_idx_y[16] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3 }; static const uint8_t block_idx_xy[4][4] = { { 0, 2, 8, 10 }, { 1, 3, 9, 11 }, { 4, 6, 12, 14 }, { 5, 7, 13, 15 } }; static const uint8_t block_idx_xy_1d[16] = { 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 }; static const uint8_t block_idx_yx_1d[16] = { 0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15 }; static const uint8_t block_idx_xy_fenc[16] = { 0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE, 0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE, 2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE, 2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE, 0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE, 0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE, 2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE, 2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE }; static const uint16_t block_idx_xy_fdec[16] = { 0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE, 0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE, 2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE, 2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE, 0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE, 0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE, 2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE, 2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE }; #define QP(qP) ( (qP)+QP_BD_OFFSET ) static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #if BIT_DEPTH > 9 QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7), #endif #if BIT_DEPTH > 8 QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1), #endif QP(0), QP(1), QP(2), QP(3), QP(4), QP(5), QP(6), QP(7), QP(8), QP(9), QP(10), QP(11), QP(12), QP(13), QP(14), QP(15), QP(16), QP(17), QP(18), QP(19), QP(20), QP(21), QP(22), QP(23), QP(24), QP(25), QP(26), QP(27), QP(28), QP(29), QP(29), QP(30), QP(31), QP(32), QP(32), QP(33), QP(34), QP(34), QP(35), QP(35), QP(36), QP(36), QP(37), QP(37), QP(37), QP(38), QP(38), QP(38), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), }; #undef QP enum cabac_ctx_block_cat_e { DCT_LUMA_DC = 0, DCT_LUMA_AC = 1, DCT_LUMA_4x4 = 2, DCT_CHROMA_DC = 
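/* ctxBlockCat: CABAC residual coding picks its context sets (coded_block_flag,
 * significance map, last coefficient, levels) according to which of these
 * block categories is being coded. */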
3, DCT_CHROMA_AC = 4, DCT_LUMA_8x8 = 5, DCT_CHROMAU_DC = 6, DCT_CHROMAU_AC = 7, DCT_CHROMAU_4x4 = 8, DCT_CHROMAU_8x8 = 9, DCT_CHROMAV_DC = 10, DCT_CHROMAV_AC = 11, DCT_CHROMAV_4x4 = 12, DCT_CHROMAV_8x8 = 13, }; static const uint8_t ctx_cat_plane[6][3] = { { DCT_LUMA_DC, DCT_CHROMAU_DC, DCT_CHROMAV_DC}, { DCT_LUMA_AC, DCT_CHROMAU_AC, DCT_CHROMAV_AC}, {DCT_LUMA_4x4, DCT_CHROMAU_4x4, DCT_CHROMAV_4x4}, {0}, {0}, {DCT_LUMA_8x8, DCT_CHROMAU_8x8, DCT_CHROMAV_8x8} }; /* Per-frame allocation: is allocated per-thread only in frame-threads mode. */ #define x264_macroblock_cache_allocate x264_template(macroblock_cache_allocate) int x264_macroblock_cache_allocate( x264_t *h ); #define x264_macroblock_cache_free x264_template(macroblock_cache_free) void x264_macroblock_cache_free( x264_t *h ); /* Per-thread allocation: is allocated per-thread even in sliced-threads mode. */ #define x264_macroblock_thread_allocate x264_template(macroblock_thread_allocate) int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ); #define x264_macroblock_thread_free x264_template(macroblock_thread_free) void x264_macroblock_thread_free( x264_t *h, int b_lookahead ); #define x264_macroblock_slice_init x264_template(macroblock_slice_init) void x264_macroblock_slice_init( x264_t *h ); #define x264_macroblock_thread_init x264_template(macroblock_thread_init) void x264_macroblock_thread_init( x264_t *h ); #define x264_macroblock_cache_load_interlaced x264_template(macroblock_cache_load_interlaced) void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y ); #define x264_macroblock_cache_load_progressive x264_template(macroblock_cache_load_progressive) void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y ); #define x264_macroblock_deblock_strength x264_template(macroblock_deblock_strength) void x264_macroblock_deblock_strength( x264_t *h ); #define x264_macroblock_cache_save x264_template(macroblock_cache_save) void x264_macroblock_cache_save( x264_t *h ); #define x264_macroblock_bipred_init x264_template(macroblock_bipred_init) void x264_macroblock_bipred_init( x264_t *h ); #define x264_prefetch_fenc x264_template(prefetch_fenc) void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ); #define x264_copy_column8 x264_template(copy_column8) void x264_copy_column8( pixel *dst, pixel *src ); /* x264_mb_predict_mv_16x16: * set mvp with predicted mv for D_16x16 block * h->mb. need only valid values from other blocks */ #define x264_mb_predict_mv_16x16 x264_template(mb_predict_mv_16x16) void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] ); /* x264_mb_predict_mv_pskip: * set mvp with predicted mv for P_SKIP * h->mb. need only valid values from other blocks */ #define x264_mb_predict_mv_pskip x264_template(mb_predict_mv_pskip) void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] ); /* x264_mb_predict_mv: * set mvp with predicted mv for all blocks except SKIP and DIRECT * h->mb. need valid ref/partition/sub of current block to be valid * and valid mv/ref from other blocks. */ #define x264_mb_predict_mv x264_template(mb_predict_mv) void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] ); /* x264_mb_predict_mv_direct16x16: * set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT * h->mb. need only valid values from other blocks. * return 1 on success, 0 on failure. * if b_changed != NULL, set it to whether refs or mvs differ from * before this functioncall. 
*/ #define x264_mb_predict_mv_direct16x16 x264_template(mb_predict_mv_direct16x16) int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed ); /* x264_mb_predict_mv_ref16x16: * set mvc with D_16x16 prediction. * uses all neighbors, even those that didn't end up using this ref. * h->mb. need only valid values from other blocks */ #define x264_mb_predict_mv_ref16x16 x264_template(mb_predict_mv_ref16x16) void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc ); #define x264_mb_mc x264_template(mb_mc) void x264_mb_mc( x264_t *h ); #define x264_mb_mc_8x8 x264_template(mb_mc_8x8) void x264_mb_mc_8x8( x264_t *h, int i8 ); static ALWAYS_INLINE uint32_t pack16to32( uint32_t a, uint32_t b ) { #if WORDS_BIGENDIAN return b + (a<<16); #else return a + (b<<16); #endif } static ALWAYS_INLINE uint32_t pack8to16( uint32_t a, uint32_t b ) { #if WORDS_BIGENDIAN return b + (a<<8); #else return a + (b<<8); #endif } static ALWAYS_INLINE uint32_t pack8to32( uint32_t a, uint32_t b, uint32_t c, uint32_t d ) { #if WORDS_BIGENDIAN return d + (c<<8) + (b<<16) + (a<<24); #else return a + (b<<8) + (c<<16) + (d<<24); #endif } static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b ) { #if WORDS_BIGENDIAN return (b&0xFFFF) + ((uint32_t)a<<16); #else return (a&0xFFFF) + ((uint32_t)b<<16); #endif } static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b ) { #if WORDS_BIGENDIAN return b + ((uint64_t)a<<32); #else return a + ((uint64_t)b<<32); #endif } #if HIGH_BIT_DEPTH # define pack_pixel_1to2 pack16to32 # define pack_pixel_2to4 pack32to64 #else # define pack_pixel_1to2 pack8to16 # define pack_pixel_2to4 pack16to32 #endif static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx ) { const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1]; const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8]; const int m = X264_MIN( x264_mb_pred_mode4x4_fix(ma), x264_mb_pred_mode4x4_fix(mb) ); if( m < 0 ) return I_PRED_4x4_DC; return m; } static ALWAYS_INLINE int x264_mb_predict_non_zero_code( x264_t *h, int idx ) { const int za = h->mb.cache.non_zero_count[x264_scan8[idx] - 1]; const int zb = h->mb.cache.non_zero_count[x264_scan8[idx] - 8]; int i_ret = za + zb; if( i_ret < 0x80 ) i_ret = ( i_ret + 1 ) >> 1; return i_ret & 0x7f; } /* intra and skip are disallowed, p8x8 is conditional. */ static const uint8_t x264_transform_allowed[X264_MBTYPE_MAX] = { 0,0,0,0,1,2,0,1,1,1,1,1,1,1,1,1,1,1,0 }; /* x264_mb_transform_8x8_allowed: * check whether any partition is smaller than 8x8 (or at least * might be, according to just partition type.) * doesn't check for cbp */ static ALWAYS_INLINE int x264_mb_transform_8x8_allowed( x264_t *h ) { if( !h->pps->b_transform_8x8_mode ) return 0; if( h->mb.i_type != P_8x8 ) return x264_transform_allowed[h->mb.i_type]; return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101; } #endif x264-master/common/mc.c000066400000000000000000000651151502133446700151220ustar00rootroot00000000000000/***************************************************************************** * mc.c: motion compensation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" #if HAVE_MMX #include "x86/mc.h" #endif #if HAVE_ALTIVEC #include "ppc/mc.h" #endif #if HAVE_ARMV6 #include "arm/mc.h" #endif #if HAVE_AARCH64 #include "aarch64/mc.h" #endif #if HAVE_MSA #include "mips/mc.h" #endif #if HAVE_LSX # include "loongarch/mc.h" #endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, pixel *src1, intptr_t i_src1_stride, pixel *src2, intptr_t i_src2_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { for( int x = 0; x < i_width; x++ ) dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; dst += i_dst_stride; src1 += i_src1_stride; src2 += i_src2_stride; } } static inline void pixel_avg_wxh( pixel *dst, intptr_t i_dst, pixel *src1, intptr_t i_src1, pixel *src2, intptr_t i_src2, int width, int height ) { for( int y = 0; y < height; y++ ) { for( int x = 0; x < width; x++ ) dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; src1 += i_src1; src2 += i_src2; dst += i_dst; } } /* Implicit weighted bipred only: * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */ static inline void pixel_avg_weight_wxh( pixel *dst, intptr_t i_dst, pixel *src1, intptr_t i_src1, pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 ) { int i_weight2 = 64 - i_weight1; for( int y = 0; y> 6 ); } #undef op_scale2 #define PIXEL_AVG_C( name, width, height ) \ static void name( pixel *pix1, intptr_t i_stride_pix1, \ pixel *pix2, intptr_t i_stride_pix2, \ pixel *pix3, intptr_t i_stride_pix3, int weight ) \ { \ if( weight == 32 ) \ pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \ else \ pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \ } PIXEL_AVG_C( pixel_avg_16x16, 16, 16 ) PIXEL_AVG_C( pixel_avg_16x8, 16, 8 ) PIXEL_AVG_C( pixel_avg_8x16, 8, 16 ) PIXEL_AVG_C( pixel_avg_8x8, 8, 8 ) PIXEL_AVG_C( pixel_avg_8x4, 8, 4 ) PIXEL_AVG_C( pixel_avg_4x16, 4, 16 ) PIXEL_AVG_C( pixel_avg_4x8, 4, 8 ) PIXEL_AVG_C( pixel_avg_4x4, 4, 4 ) PIXEL_AVG_C( pixel_avg_4x2, 4, 2 ) PIXEL_AVG_C( pixel_avg_2x8, 2, 8 ) PIXEL_AVG_C( pixel_avg_2x4, 2, 4 ) PIXEL_AVG_C( pixel_avg_2x2, 2, 2 ) static void weight_cache( x264_t *h, x264_weight_t *w ) { w->weightfn = h->mc.weight; } #define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset ) #define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset ) static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int i_width, int i_height ) { int offset = weight->i_offset * (1 << (BIT_DEPTH-8)); int scale = weight->i_scale; int denom = weight->i_denom; if( denom >= 1 ) { for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride ) for( int x = 0; x < i_width; x++ ) opscale( x ); } else { for( int y = 0; y < i_height; y++, dst += 
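/* Explicit weighted prediction: dst = clip(((src*scale + (1<<(denom-1))) >> denom) + offset).
 * scale == 1<<denom with offset 0 is the identity weight; e.g. scale=3, denom=1,
 * offset=0 gives roughly a 1.5x gain. */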
i_dst_stride, src += i_src_stride ) for( int x = 0; x < i_width; x++ ) opscale_noden( x ); } } #define MC_WEIGHT_C( name, width ) \ static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \ { \ mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\ } MC_WEIGHT_C( mc_weight_w20, 20 ) MC_WEIGHT_C( mc_weight_w16, 16 ) MC_WEIGHT_C( mc_weight_w12, 12 ) MC_WEIGHT_C( mc_weight_w8, 8 ) MC_WEIGHT_C( mc_weight_w4, 4 ) MC_WEIGHT_C( mc_weight_w2, 2 ) static weight_fn_t mc_weight_wtab[6] = { mc_weight_w2, mc_weight_w4, mc_weight_w8, mc_weight_w12, mc_weight_w16, mc_weight_w20, }; static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { memcpy( dst, src, i_width * SIZEOF_PIXEL ); src += i_src_stride; dst += i_dst_stride; } } #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d])) static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, intptr_t stride, int width, int height, int16_t *buf ) { const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0; for( int y = 0; y < height; y++ ) { for( int x = -2; x < width+3; x++ ) { int v = TAPFILTER(src,stride); dstv[x] = x264_clip_pixel( (v + 16) >> 5 ); /* transform v for storage in a 16-bit integer */ buf[x+2] = v + pad; } for( int x = 0; x < width; x++ ) dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) - 32*pad + 512) >> 10 ); for( int x = 0; x < width; x++ ) dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 ); dsth += stride; dstv += stride; dstc += stride; src += stride; } } static void mc_luma( pixel *dst, intptr_t i_dst_stride, pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) mc_weight( dst, i_dst_stride, dst, i_dst_stride, weight, i_width, i_height ); } else if( weight->weightfn ) mc_weight( dst, i_dst_stride, src1, i_src_stride, weight, i_width, i_height ); else mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height ); } static pixel *get_ref( pixel *dst, intptr_t *i_dst_stride, pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, *i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) mc_weight( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_width, i_height ); return dst; } else if( weight->weightfn ) { mc_weight( dst, *i_dst_stride, src1, i_src_stride, weight, i_width, i_height ); return dst; } else { *i_dst_stride = i_src_stride; return src1; } } /* full chroma mc (ie until 1/8 pixel)*/ static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride, pixel *src, intptr_t 
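/* Bilinear interpolation of the interleaved chroma plane at 1/8-pel precision:
 * the weights cA..cD below are (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy and dx*dy,
 * which always sum to 64, hence the +32 rounding and the >>6 normalization. */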
i_src_stride, int mvx, int mvy, int i_width, int i_height ) { pixel *srcp; int d8x = mvx&0x07; int d8y = mvy&0x07; int cA = (8-d8x)*(8-d8y); int cB = d8x *(8-d8y); int cC = (8-d8x)*d8y; int cD = d8x *d8y; src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; for( int y = 0; y < i_height; y++ ) { for( int x = 0; x < i_width; x++ ) { dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] + cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6; dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] + cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6; } dstu += i_dst_stride; dstv += i_dst_stride; src = srcp; srcp += i_src_stride; } } #define MC_COPY(W) \ static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \ { \ mc_copy( src, i_src, dst, i_dst, W, i_height ); \ } MC_COPY( 16 ) MC_COPY( 8 ) MC_COPY( 4 ) void x264_plane_copy_c( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ) { while( h-- ) { memcpy( dst, src, w * SIZEOF_PIXEL ); dst += i_dst; src += i_src; } } void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ) { for( int y=0; y>8)&0xff00) + (x>>24); } #else #define v210_endian_fix32(x) (x) #endif static void plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty, pixel *dstc, intptr_t i_dstc, uint32_t *src, intptr_t i_src, int w, int h ) { for( int l = 0; l < h; l++ ) { pixel *dsty0 = dsty; pixel *dstc0 = dstc; uint32_t *src0 = src; for( int n = 0; n < w; n += 3 ) { uint32_t s = v210_endian_fix32( *src0++ ); *dstc0++ = s & 0x03FF; *dsty0++ = (s >> 10) & 0x03FF; *dstc0++ = (s >> 20) & 0x03FF; s = v210_endian_fix32( *src0++ ); *dsty0++ = s & 0x03FF; *dstc0++ = (s >> 10) & 0x03FF; *dsty0++ = (s >> 20) & 0x03FF; } dsty += i_dsty; dstc += i_dstc; src += i_src; } } static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ) { for( int y=0; yplane[0]; int i_stride = frame->i_stride[0]; int i_height = frame->i_lines[0]; int i_width = frame->i_width[0]; // duplicate last row and column so that their interpolation doesn't have to be special-cased for( int y = 0; y < i_height; y++ ) src[i_width+y*i_stride] = src[i_width-1+y*i_stride]; memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * SIZEOF_PIXEL ); h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3], i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres ); x264_frame_expand_border_lowres( frame ); memset( frame->i_cost_est, -1, sizeof(frame->i_cost_est) ); for( int y = 0; y < h->param.i_bframe + 2; y++ ) for( int x = 0; x < h->param.i_bframe + 2; x++ ) frame->i_row_satds[y][x][0] = -1; for( int y = 0; y <= !!h->param.i_bframe; y++ ) for( int x = 0; x <= h->param.i_bframe; x++ ) frame->lowres_mvs[y][x][0][0] = 0x7FFF; } static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc, intptr_t src_stride, intptr_t dst_stride, int width, int height ) { for( int y = 0; y < height; y++ ) { pixel *src1 = src0+src_stride; pixel *src2 = src1+src_stride; for( int x = 0; x>1)+((c+d+1)>>1)+1)>>1) dst0[x] = FILTER(src0[2*x ], src1[2*x ], src0[2*x+1], src1[2*x+1]); dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]); dstv[x] = FILTER(src1[2*x ], src2[2*x ], src1[2*x+1], src2[2*x+1]); dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]); #undef FILTER } src0 += src_stride*2; dst0 += dst_stride; dsth += dst_stride; dstv += dst_stride; dstc += dst_stride; } } /* 
Estimate the total amount of influence on future quality that could be had if we * were to improve the reference samples used to inter predict any given macroblock. */ static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) { float fps = *fps_factor; for( int i = 0; i < len; i++ ) { int intra_cost = intra_costs[i]; int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK); float propagate_intra = intra_cost * inv_qscales[i]; float propagate_amount = propagate_in[i] + propagate_intra*fps; float propagate_num = intra_cost - inter_cost; float propagate_denom = intra_cost; dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767); } } static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, int bipred_weight, int mb_y, int len, int list ) { unsigned stride = h->mb.i_mb_stride; unsigned width = h->mb.i_mb_width; unsigned height = h->mb.i_mb_height; for( int i = 0; i < len; i++ ) { int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT; if( !(lists_used & (1 << list)) ) continue; int listamount = propagate_amount[i]; /* Apply bipred weighting. */ if( lists_used == 3 ) listamount = (listamount * bipred_weight + 32) >> 6; /* Early termination for simple case of mv0. */ if( !M32( mvs[i] ) ) { MC_CLIP_ADD( ref_costs[mb_y*stride + i], listamount ); continue; } int x = mvs[i][0]; int y = mvs[i][1]; unsigned mbx = (unsigned)((x>>5)+i); unsigned mby = (unsigned)((y>>5)+mb_y); unsigned idx0 = mbx + mby * stride; unsigned idx2 = idx0 + stride; x &= 31; y &= 31; int idx0weight = (32-y)*(32-x); int idx1weight = (32-y)*x; int idx2weight = y*(32-x); int idx3weight = y*x; idx0weight = (idx0weight * listamount + 512) >> 10; idx1weight = (idx1weight * listamount + 512) >> 10; idx2weight = (idx2weight * listamount + 512) >> 10; idx3weight = (idx3weight * listamount + 512) >> 10; if( mbx < width-1 && mby < height-1 ) { MC_CLIP_ADD( ref_costs[idx0+0], idx0weight ); MC_CLIP_ADD( ref_costs[idx0+1], idx1weight ); MC_CLIP_ADD( ref_costs[idx2+0], idx2weight ); MC_CLIP_ADD( ref_costs[idx2+1], idx3weight ); } else { /* Note: this takes advantage of unsigned representation to * catch negative mbx/mby. 
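* (A negative mbx or mby wraps to a huge unsigned value, so the plain "< width"
* and "< height" compares below reject it without any extra sign checks.)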
*/ if( mby < height ) { if( mbx < width ) MC_CLIP_ADD( ref_costs[idx0+0], idx0weight ); if( mbx+1 < width ) MC_CLIP_ADD( ref_costs[idx0+1], idx1weight ); } if( mby+1 < height ) { if( mbx < width ) MC_CLIP_ADD( ref_costs[idx2+0], idx2weight ); if( mbx+1 < width ) MC_CLIP_ADD( ref_costs[idx2+1], idx3weight ); } } } } /* Conversion between float and Q8.8 fixed point (big-endian) for storage */ static void mbtree_fix8_pack( uint16_t *dst, float *src, int count ) { for( int i = 0; i < count; i++ ) dst[i] = endian_fix16( (int16_t)(src[i] * 256.0f) ); } static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count ) { for( int i = 0; i < count; i++ ) dst[i] = (int16_t)endian_fix16( src[i] ) * (1.0f/256.0f); } void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent ) { pf->mc_luma = mc_luma; pf->get_ref = get_ref; pf->mc_chroma = mc_chroma; pf->avg[PIXEL_16x16]= pixel_avg_16x16; pf->avg[PIXEL_16x8] = pixel_avg_16x8; pf->avg[PIXEL_8x16] = pixel_avg_8x16; pf->avg[PIXEL_8x8] = pixel_avg_8x8; pf->avg[PIXEL_8x4] = pixel_avg_8x4; pf->avg[PIXEL_4x16] = pixel_avg_4x16; pf->avg[PIXEL_4x8] = pixel_avg_4x8; pf->avg[PIXEL_4x4] = pixel_avg_4x4; pf->avg[PIXEL_4x2] = pixel_avg_4x2; pf->avg[PIXEL_2x8] = pixel_avg_2x8; pf->avg[PIXEL_2x4] = pixel_avg_2x4; pf->avg[PIXEL_2x2] = pixel_avg_2x2; pf->weight = mc_weight_wtab; pf->offsetadd = mc_weight_wtab; pf->offsetsub = mc_weight_wtab; pf->weight_cache = weight_cache; pf->copy_16x16_unaligned = mc_copy_w16; pf->copy[PIXEL_16x16] = mc_copy_w16; pf->copy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; pf->store_interleave_chroma = store_interleave_chroma; pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc; pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; pf->plane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_c; pf->plane_copy_deinterleave_v210 = plane_copy_deinterleave_v210_c; pf->hpel_filter = hpel_filter; pf->prefetch_fenc_400 = prefetch_fenc_null; pf->prefetch_fenc_420 = prefetch_fenc_null; pf->prefetch_fenc_422 = prefetch_fenc_null; pf->prefetch_ref = prefetch_ref_null; pf->memcpy_aligned = memcpy; pf->memzero_aligned = memzero_aligned; pf->frame_init_lowres_core = frame_init_lowres_core; pf->integral_init4h = integral_init4h; pf->integral_init8h = integral_init8h; pf->integral_init4v = integral_init4v; pf->integral_init8v = integral_init8v; pf->mbtree_propagate_cost = mbtree_propagate_cost; pf->mbtree_propagate_list = mbtree_propagate_list; pf->mbtree_fix8_pack = mbtree_fix8_pack; pf->mbtree_fix8_unpack = mbtree_fix8_unpack; #if HAVE_MMX x264_mc_init_mmx( cpu, pf ); #endif #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) x264_mc_init_altivec( pf ); #endif #if HAVE_ARMV6 x264_mc_init_arm( cpu, pf ); #endif #if HAVE_AARCH64 x264_mc_init_aarch64( cpu, pf ); #endif #if HAVE_MSA if( cpu&X264_CPU_MSA ) x264_mc_init_mips( cpu, pf ); #endif #if HAVE_LSX x264_mc_init_loongarch( cpu, pf ); #endif if( cpu_independent ) { pf->mbtree_propagate_cost = mbtree_propagate_cost; pf->mbtree_propagate_list = mbtree_propagate_list; } } void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) { const int b_interlaced = PARAM_INTERLACED; int start = mb_y*16 - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8 int height = 
(b_end ? frame->i_lines[0] + 16*PARAM_INTERLACED : (mb_y+b_interlaced)*16) + 8; if( mb_y & b_interlaced ) return; for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ ) { int stride = frame->i_stride[p]; const int width = frame->i_width[p]; int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd if( !b_interlaced || h->mb.b_adaptive_mbaff ) h->mc.hpel_filter( frame->filtered[p][1] + offs, frame->filtered[p][2] + offs, frame->filtered[p][3] + offs, frame->plane[p] + offs, stride, width + 16, height - start, h->scratch_buffer ); if( b_interlaced ) { /* MC must happen between pixels in the same field. */ stride = frame->i_stride[p] << 1; start = (mb_y*16 >> 1) - 8; int height_fld = ((b_end ? frame->i_lines[p] : mb_y*16) >> 1) + 8; offs = start*stride - 8; for( int i = 0; i < 2; i++, offs += frame->i_stride[p] ) { h->mc.hpel_filter( frame->filtered_fld[p][1] + offs, frame->filtered_fld[p][2] + offs, frame->filtered_fld[p][3] + offs, frame->plane_fld[p] + offs, stride, width + 16, height_fld - start, h->scratch_buffer ); } } } /* generate integral image: * frame->integral contains 2 planes. in the upper plane, each element is * the sum of an 8x8 pixel region with top-left corner on that point. * in the lower plane, 4x4 sums (needed only with --partitions p4x4). */ if( frame->integral ) { int stride = frame->i_stride[0]; if( start < 0 ) { memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) ); start = -PADV; } if( b_end ) height += PADV-9; for( int y = start; y < height; y++ ) { pixel *pix = frame->plane[0] + y * stride - PADH_ALIGN; uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN; uint16_t *sum4; if( h->frames.b_have_sub8x8_esa ) { h->mc.integral_init4h( sum8, pix, stride ); sum8 -= 8*stride; sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2); if( y >= 8-PADV ) h->mc.integral_init4v( sum8, sum4, stride ); } else { h->mc.integral_init8h( sum8, pix, stride ); if( y >= 8-PADV ) h->mc.integral_init8v( sum8-8*stride, stride ); } } } } x264-master/common/mc.h000066400000000000000000000360661502133446700151320ustar00rootroot00000000000000/***************************************************************************** * mc.h: motion compensation ***************************************************************************** * Copyright (C) 2004-2025 x264 project * * Authors: Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_MC_H #define X264_MC_H #define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) #define MC_CLIP_ADD2(s,x)\ do\ {\ MC_CLIP_ADD((s)[0], (x)[0]);\ MC_CLIP_ADD((s)[1], (x)[1]);\ } while( 0 ) #define x264_mbtree_propagate_list_internal_neon x264_template(mbtree_propagate_list_internal_neon) #define PROPAGATE_LIST(cpu)\ void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\ uint16_t *lowres_costs, int16_t *output,\ int bipred_weight, int mb_y, int len );\ \ static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\ int16_t *propagate_amount, uint16_t *lowres_costs,\ int bipred_weight, int mb_y, int len, int list )\ {\ int16_t *current = h->scratch_buffer2;\ \ x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\ current, bipred_weight, mb_y, len );\ \ unsigned stride = h->mb.i_mb_stride;\ unsigned width = h->mb.i_mb_width;\ unsigned height = h->mb.i_mb_height;\ \ for( int i = 0; i < len; current += 32 )\ {\ int end = X264_MIN( i+8, len );\ for( ; i < end; i++, current += 2 )\ {\ if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\ continue;\ \ unsigned mbx = (unsigned)current[0];\ unsigned mby = (unsigned)current[1];\ unsigned idx0 = mbx + mby * stride;\ unsigned idx2 = idx0 + stride;\ \ /* Shortcut for the simple/common case of zero MV */\ if( !M32( mvs[i] ) )\ {\ MC_CLIP_ADD( ref_costs[idx0], current[16] );\ continue;\ }\ \ if( mbx < width-1 && mby < height-1 )\ {\ MC_CLIP_ADD2( ref_costs+idx0, current+16 );\ MC_CLIP_ADD2( ref_costs+idx2, current+32 );\ }\ else\ {\ /* Note: this takes advantage of unsigned representation to\ * catch negative mbx/mby. */\ if( mby < height )\ {\ if( mbx < width )\ MC_CLIP_ADD( ref_costs[idx0+0], current[16] );\ if( mbx+1 < width )\ MC_CLIP_ADD( ref_costs[idx0+1], current[17] );\ }\ if( mby+1 < height )\ {\ if( mbx < width )\ MC_CLIP_ADD( ref_costs[idx2+0], current[32] );\ if( mbx+1 < width )\ MC_CLIP_ADD( ref_costs[idx2+1], current[33] );\ }\ }\ }\ }\ } #define x264_plane_copy_c x264_template(plane_copy_c) void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define PLANE_COPY(align, cpu)\ static void plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ {\ int c_w = (align) / SIZEOF_PIXEL - 1;\ if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ else if( !(w&c_w) )\ x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ else\ {\ if( --h > 0 )\ {\ if( i_src > 0 )\ {\ x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ dst += i_dst * h;\ src += i_src * h;\ }\ else\ x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ }\ /* use plain memcpy on the last line (in memory order) to avoid overreading src. 
*/\ memcpy( dst, src, w*SIZEOF_PIXEL );\ }\ } #define x264_plane_copy_swap_c x264_template(plane_copy_swap_c) void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define PLANE_COPY_SWAP(align, cpu)\ static void plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ {\ int c_w = (align>>1) / SIZEOF_PIXEL - 1;\ if( !(w&c_w) )\ x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ else if( w > c_w )\ {\ if( --h > 0 )\ {\ if( i_src > 0 )\ {\ x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ dst += i_dst * h;\ src += i_src * h;\ }\ else\ x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ }\ x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ {\ dst[x] = src[x+1];\ dst[x+1] = src[x];\ }\ }\ else\ x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ } #define x264_plane_copy_deinterleave_c x264_template(plane_copy_deinterleave_c) void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); /* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYUV * input with the additional constraint that we cannot overread src. */ #define PLANE_COPY_YUYV(align, cpu)\ static void plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\ pixel *src, intptr_t i_src, int w, int h )\ {\ int c_w = (align>>1) / SIZEOF_PIXEL - 1;\ if( !(w&c_w) )\ x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ else if( w > c_w )\ {\ if( --h > 0 )\ {\ if( i_src > 0 )\ {\ x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ dsta += i_dsta * h;\ dstb += i_dstb * h;\ src += i_src * h;\ }\ else\ x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\ src+i_src, i_src, w, h );\ }\ x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\ }\ else\ x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ } #define x264_plane_copy_interleave_c x264_template(plane_copy_interleave_c) void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define PLANE_INTERLEAVE(cpu) \ static void plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ pixel *srcu, intptr_t i_srcu,\ pixel *srcv, intptr_t i_srcv, int w, int h )\ {\ int c_w = 16 / SIZEOF_PIXEL - 1;\ if( !(w&c_w) )\ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ {\ if( --h > 0 )\ {\ if( i_srcu > 0 )\ {\ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ dst += i_dst * h;\ srcu += i_srcu * h;\ srcv += i_srcv * h;\ }\ else\ x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ }\ x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ }\ else\ x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ } struct x264_weight_t; typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int ); typedef struct x264_weight_t { /* aligning the first member is a gcc hack to force the struct to be * 16 byte aligned, as well as force 
sizeof(struct) to be a multiple of 16 */ ALIGNED_16( int16_t cachea[8] ); int16_t cacheb[8]; int32_t i_denom; int32_t i_scale; int32_t i_offset; weight_fn_t *weightfn; } ALIGNED_16( x264_weight_t ); #define x264_weight_none ((const x264_weight_t*)x264_zero) #define SET_WEIGHT( w, b, s, d, o )\ {\ (w).i_scale = (s);\ (w).i_denom = (d);\ (w).i_offset = (o);\ if( b )\ h->mc.weight_cache( h, &w );\ else\ w.weightfn = NULL;\ } /* Do the MC * XXX: Only width = 4, 8 or 16 are valid * width == 4 -> height == 4 or 8 * width == 8 -> height == 4 or 8 or 16 * width == 16-> height == 8 or 16 * */ typedef struct { void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ); /* may round up the dimensions if they're not a power of 2 */ pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ); /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. */ void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src, int mvx, int mvy, int i_width, int i_height ); void (*avg[12])( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, pixel *src2, intptr_t src2_stride, int i_weight ); /* only 16x16, 8x8, and 4x4 defined */ void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height ); void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height ); void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height ); void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height ); void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); /* may write up to 15 pixels off the end of each plane */ void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty, pixel *dstc, intptr_t i_dstc, uint32_t *src, intptr_t i_src, int w, int h ); void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, intptr_t i_stride, int i_width, int i_height, int16_t *buf ); /* prefetch the next few macroblocks of fenc or fdec */ void (*prefetch_fenc) ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x ); void (*prefetch_fenc_400)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x ); void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x ); void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x ); /* prefetch the next few macroblocks of a hpel reference 
frame */ void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity ); void *(*memcpy_aligned)( void *dst, const void *src, size_t n ); void (*memzero_aligned)( void *dst, size_t n ); /* successive elimination prefilter */ void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride ); void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride ); void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); void (*integral_init8v)( uint16_t *sum8, intptr_t stride ); void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc, intptr_t src_stride, intptr_t dst_stride, int width, int height ); weight_fn_t *weight; weight_fn_t *offsetadd; weight_fn_t *offsetsub; void (*weight_cache)( x264_t *, x264_weight_t * ); void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, int bipred_weight, int mb_y, int len, int list ); void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count ); void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count ); } x264_mc_functions_t; #define x264_mc_init x264_template(mc_init) void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent ); #endif x264-master/common/mips/000077500000000000000000000000001502133446700153175ustar00rootroot00000000000000x264-master/common/mips/dct-c.c000066400000000000000000000511701502133446700164610ustar00rootroot00000000000000/***************************************************************************** * dct-c.c: msa transform and zigzag ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Rishikesh More * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macros.h" #include "dct.h" #if !HIGH_BIT_DEPTH #define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ \ tmp0_m = in0 + in2; \ tmp1_m = in0 - in2; \ tmp2_m = in1 >> 1; \ tmp2_m = tmp2_m - in3; \ tmp3_m = in3 >> 1; \ tmp3_m = in1 + tmp3_m; \ \ BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 ); \ } static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst, int32_t i_src_stride ) { v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3; v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; v4i32 hor_res0, hor_res1, hor_res2, hor_res3; v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r; LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); UNPCK_R_SH_SW( src0, src0_r ); UNPCK_R_SH_SW( src1, src1_r ); UNPCK_R_SH_SW( src2, src2_r ); UNPCK_R_SH_SW( src3, src3_r ); BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, tmp0, tmp3, tmp2, tmp1 ); BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, hor_res0, hor_res3, hor_res2, hor_res1 ); TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3, hor_res0, hor_res1, hor_res2, hor_res3 ); BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1, tmp0, tmp3, tmp2, tmp1 ); BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r ); SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 ); PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r, ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r, ver_res0, ver_res1, ver_res2, ver_res3 ); PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 ); ST_SH2( ver_res0, ver_res2, p_dst, 8 ); } static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_dst_stride, int16_t *p_dst ) { uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_ref0, i_ref1, i_ref2, i_ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; v16u8 inp0, inp1; v8i16 diff0, diff1, diff2, diff3; v8i16 temp0, temp1, temp2, temp3; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref ); ILVRL_B2_UB( src, ref, inp0, inp1 ); HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); diff0 = temp0 + temp1; diff1 = ( temp3 << 1 ) + temp2; diff2 = temp0 - temp1; diff3 = temp3 - ( temp2 << 1 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); temp0 = diff0 + diff1; temp1 = ( diff3 << 1 ) + diff2; temp2 = diff0 - diff1; temp3 = diff3 - ( diff2 << 1 ); ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); ST_UB2( inp0, inp1, p_dst, 8 ); } static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16], int16_t pi_level[16] ) { v8i16 src0, src1; v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 }; v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 }; LD_SH2( pi_dct, 8, src0, src1 ); VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 ); ST_SH2( mask0, mask1, pi_level, 8 ); } static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3; v8i16 hres0, hres1, hres2, hres3; v8i16 vres0, vres1, 
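/* AVC_ITRANS_H above is the H.264 4x4 inverse-transform butterfly
 * (e0 = s0+s2, e1 = s0-s2, e2 = (s1>>1)-s3, e3 = s1+(s3>>1)); it is applied
 * once, the block is transposed, it is applied again, and the result is
 * rounded with a >>6 shift before being added to the predicted pixels. */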
vres2, vres3; v8i16 zeros = { 0 }; LD4x4_SH( p_src, src0, src1, src2, src3 ); AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 ); TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3 ); AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 ); SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 ); ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride ); ST_SH2( zeros, zeros, p_src, 8 ); } static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { int16_t i_dc; uint32_t i_src0, i_src1, i_src2, i_src3; v16u8 pred = { 0 }; v16i8 out; v8i16 input_dc, pred_r, pred_l; i_dc = ( p_src[0] + 32 ) >> 6; input_dc = __msa_fill_h( i_dc ); p_src[ 0 ] = 0; LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 ); INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred ); UNPCK_UB_SH( pred, pred_r, pred_l ); pred_r += input_dc; pred_l += input_dc; CLIP_SH2_0_255( pred_r, pred_l ); out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r ); ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride ); } static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 vec0, vec1, vec2, vec3; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16i8 zeros = { 0 }; p_src[ 0 ] += 32; LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); vec0 = src0 + src4; vec1 = src0 - src4; vec2 = src2 >> 1; vec2 = vec2 - src6; vec3 = src6 >> 1; vec3 = src2 + vec3; BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 ); vec0 = src7 >> 1; vec0 = src5 - vec0 - src3 - src7; vec1 = src3 >> 1; vec1 = src1 - vec1 + src7 - src3; vec2 = src5 >> 1; vec2 = vec2 - src1 + src7 + src5; vec3 = src1 >> 1; vec3 = vec3 + src3 + src5 + src1; tmp4 = vec3 >> 2; tmp4 += vec0; tmp5 = vec2 >> 2; tmp5 += vec1; tmp6 = vec1 >> 2; tmp6 -= vec2; tmp7 = vec0 >> 2; tmp7 = vec3 - tmp7; BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, res0, res1, res2, res3, res4, res5, res6, res7 ); TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7, res0, res1, res2, res3, res4, res5, res6, res7 ); UNPCK_SH_SW( res0, tmp0_r, tmp0_l ); UNPCK_SH_SW( res1, tmp1_r, tmp1_l ); UNPCK_SH_SW( res2, tmp2_r, tmp2_l ); UNPCK_SH_SW( res3, tmp3_r, tmp3_l ); UNPCK_SH_SW( res4, tmp4_r, tmp4_l ); UNPCK_SH_SW( res5, tmp5_r, tmp5_l ); UNPCK_SH_SW( res6, tmp6_r, tmp6_l ); UNPCK_SH_SW( res7, tmp7_r, tmp7_l ); BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r ); vec2_r = tmp2_r >> 1; vec2_l = tmp2_l >> 1; vec2_r -= tmp6_r; vec2_l -= tmp6_l; vec3_r = tmp6_r >> 1; vec3_l = tmp6_l >> 1; vec3_r += tmp2_r; vec3_l += tmp2_l; BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r ); BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l ); vec0_r = tmp7_r >> 1; vec0_l = tmp7_l >> 1; vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; vec1_r = tmp3_r >> 1; vec1_l = tmp3_l >> 1; vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; vec1_l = tmp1_l 
- vec1_l + tmp7_l - tmp3_l; vec2_r = tmp5_r >> 1; vec2_l = tmp5_l >> 1; vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; vec3_r = tmp1_r >> 1; vec3_l = tmp1_l >> 1; vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; tmp1_r = vec3_r >> 2; tmp1_l = vec3_l >> 2; tmp1_r += vec0_r; tmp1_l += vec0_l; tmp3_r = vec2_r >> 2; tmp3_l = vec2_l >> 2; tmp3_r += vec1_r; tmp3_l += vec1_l; tmp5_r = vec1_r >> 2; tmp5_l = vec1_l >> 2; tmp5_r -= vec2_r; tmp5_l -= vec2_l; tmp7_r = vec0_r >> 2; tmp7_l = vec0_l >> 2; tmp7_r = vec3_r - tmp7_r; tmp7_l = vec3_l - tmp7_l; BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r ); BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r ); BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r ); BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r ); SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 ); SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 ); SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 ); SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 ); PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, res0, res1, res2, res3 ); PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, res4, res5, res6, res7 ); LD_SB8( p_dst, i_dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, tmp0, tmp1, tmp2, tmp3 ); ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, tmp4, tmp5, tmp6, tmp7 ); ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, res0, res1, res2, res3 ); ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, res4, res5, res6, res7 ); CLIP_SH4_0_255( res0, res1, res2, res3 ); CLIP_SH4_0_255( res4, res5, res6, res7 ); PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, dst0, dst1, dst2, dst3 ); ST8x4_UB( dst0, dst1, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); ST8x4_UB( dst2, dst3, p_dst, i_dst_stride ); } static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride, int16_t *p_dst, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3; v4i32 src0_r, src1_r, src2_r, src3_r; v4i32 hres0, hres1, hres2, hres3; v8i16 vres0, vres1, vres2, vres3; v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v2i64 res0, res1; LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); UNPCK_R_SH_SW( src0, src0_r ); UNPCK_R_SH_SW( src1, src1_r ); UNPCK_R_SH_SW( src2, src2_r ); UNPCK_R_SH_SW( src3, src3_r ); BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 ); BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 ); TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3 ); BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 ); BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 ); PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, vres0, vres1, vres2, vres3 ); PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 ); ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 ); } static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *pred_ptr, int32_t i_pred_stride ) { int16_t i_sum; uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_pred0, i_pred1, i_pred2, i_pred3; v16i8 src = { 0 }; v16i8 pred = { 0 }; v16u8 src_l0, src_l1; v8i16 diff0, diff1; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, 
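/* subtract_sum4x4_msa() loads one 4x4 block of source and prediction,
 * subtracts them pixel by pixel and returns the sum of the 16 differences,
 * i.e. the DC of that sub-block's residual. */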
i_src3, src ); INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred ); ILVRL_B2_UB( src, pred, src_l0, src_l1 ); HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 ); i_sum = HADD_UH_U32( diff0 + diff1 ); return i_sum; } void x264_dct4x4dc_msa( int16_t d[16] ) { avc_dct4x4dc_msa( d, d, 4 ); } void x264_idct4x4dc_msa( int16_t d[16] ) { avc_idct4x4dc_msa( d, 4, d, 4 ); } void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] ) { avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE ); } void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] ) { avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE ); avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE ); avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0], &pi_dct[2][0], FDEC_STRIDE ); avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4], &pi_dct[3][0], FDEC_STRIDE ); } void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] ) { x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] ); x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] ); x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] ); x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] ); } void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] ) { avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE ); } void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] ) { avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE ); avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE ); avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[2][0], FDEC_STRIDE ); avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[3][0], FDEC_STRIDE ); } void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] ) { avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE ); avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE ); avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0], &pi_dct[2], FDEC_STRIDE ); avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4], &pi_dct[3], FDEC_STRIDE ); } void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] ) { for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE ) { avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE ); avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE ); avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE ); avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE ); } } void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref ) { avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst ); } void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref ) { avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE, &p_ref[0], FDEC_STRIDE, p_dst[0] ); avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE, &p_ref[4], FDEC_STRIDE, p_dst[1] ); avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0], FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 0], FDEC_STRIDE, p_dst[2] ); avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4], FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 4], FDEC_STRIDE, p_dst[3] ); } void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref ) { x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] ); x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] ); x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0], &p_ref[8*FDEC_STRIDE+0] ); x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8], &p_ref[8*FDEC_STRIDE+8] ); } void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, uint8_t *p_pix2 ) { int32_t d0, d1, d2, d3; pi_dct[0] = 
subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE, &p_pix2[0], FDEC_STRIDE );
    pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
                                     &p_pix2[4], FDEC_STRIDE );
    pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 0], FDEC_STRIDE );
    pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 4], FDEC_STRIDE );

    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
}

void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
                              uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32_t b0, b1, b2, b3, b4, b5, b6, b7;

    a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );

    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 b0, b1, b2, b3, b7, b6, b5, b4 );
    BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
                 a0, a1, a2, a3, a7, a6, a5, a4 );
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
                 pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
}

void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level );
}
#endif
x264-master/common/mips/dct.h000066400000000000000000000065061502133446700162510ustar00rootroot00000000000000
/*****************************************************************************
 * dct.h: msa transform and zigzag
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Rishikesh More
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_MIPS_DCT_H
#define X264_MIPS_DCT_H

#define x264_dct4x4dc_msa x264_template(dct4x4dc_msa)
void x264_dct4x4dc_msa( int16_t d[16] );
#define x264_idct4x4dc_msa x264_template(idct4x4dc_msa)
void x264_idct4x4dc_msa( int16_t d[16] );
#define x264_add4x4_idct_msa x264_template(add4x4_idct_msa)
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_add8x8_idct_msa x264_template(add8x8_idct_msa)
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
#define x264_add16x16_idct_msa x264_template(add16x16_idct_msa)
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
#define x264_add8x8_idct8_msa x264_template(add8x8_idct8_msa)
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
#define x264_add16x16_idct8_msa x264_template(add16x16_idct8_msa)
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
#define x264_add8x8_idct_dc_msa x264_template(add8x8_idct_dc_msa)
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
#define x264_add16x16_idct_dc_msa x264_template(add16x16_idct_dc_msa)
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
#define x264_sub4x4_dct_msa x264_template(sub4x4_dct_msa)
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct_msa x264_template(sub8x8_dct_msa)
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub16x16_dct_msa x264_template(sub16x16_dct_msa)
void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref );
#define x264_sub8x8_dct_dc_msa x264_template(sub8x8_dct_dc_msa)
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_sub8x16_dct_dc_msa x264_template(sub8x16_dct_dc_msa)
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1, uint8_t *p_pix2 );
#define x264_zigzag_scan_4x4_frame_msa x264_template(zigzag_scan_4x4_frame_msa)
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );

#endif
x264-master/common/mips/deblock-c.c000066400000000000000000002345311502133446700173160ustar00rootroot00000000000000
/*****************************************************************************
 * deblock-c.c: msa deblocking
 *****************************************************************************
 * Copyright (C) 2015-2025 x264 project
 *
 * Authors: Neha Rana
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
*****************************************************************************/ #include "common/common.h" #include "macros.h" #include "deblock.h" #if !HIGH_BIT_DEPTH #define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \ q3_or_p3_org_in, p1_or_q1_org_in, \ p2_or_q2_org_in, q1_or_p1_org_in, \ p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \ { \ v8i16 threshold; \ v8i16 const3 = __msa_ldi_h( 3 ); \ \ threshold = p0_or_q0_org_in + q3_or_p3_org_in; \ threshold += p1_or_q1_org_in; \ \ p0_or_q0_out = threshold << 1; \ p0_or_q0_out += p2_or_q2_org_in; \ p0_or_q0_out += q1_or_p1_org_in; \ p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \ \ p1_or_q1_out = p2_or_q2_org_in + threshold; \ p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \ \ p2_or_q2_out = p2_or_q2_org_in * const3; \ p2_or_q2_out += p3_or_q3_org_in; \ p2_or_q2_out += p3_or_q3_org_in; \ p2_or_q2_out += threshold; \ p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \ } /* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */ #define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \ p1_or_q1_org_in, p0_or_q0_out ) \ { \ p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \ p0_or_q0_out += p1_or_q1_org_in; \ p0_or_q0_out += p1_or_q1_org_in; \ p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \ } #define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \ p1_or_q1_org_in, p2_or_q2_org_in, \ negate_tc_in, tc_in, p1_or_q1_out ) \ { \ v8i16 clip3, temp; \ \ clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \ ( v8u16 ) q0_or_p0_org_in ); \ temp = p1_or_q1_org_in << 1; \ clip3 -= temp; \ clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \ clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \ p1_or_q1_out = p1_or_q1_org_in + clip3; \ } #define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \ p1_or_q1_org_in, q1_or_p1_org_in, \ negate_threshold_in, threshold_in, \ p0_or_q0_out, q0_or_p0_out ) \ { \ v8i16 q0_sub_p0, p1_sub_q1, delta; \ \ q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ q0_sub_p0 <<= 2; \ p1_sub_q1 += 4; \ delta = q0_sub_p0 + p1_sub_q1; \ delta >>= 3; \ \ delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \ \ p0_or_q0_out = p0_or_q0_org_in + delta; \ q0_or_p0_out = q0_or_p0_org_in - delta; \ \ CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \ } static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0; v16u8 alpha, beta; v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta; v16u8 p2, p1, p0, q0, q1, q2; v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; v8i16 p2_r = { 0 }; v8i16 p1_r = { 0 }; v8i16 p0_r = { 0 }; v8i16 q0_r = { 0 }; v8i16 q1_r = { 0 }; v8i16 q2_r = { 0 }; v8i16 p2_l = { 0 }; v8i16 p1_l = { 0 }; v8i16 p0_l = { 0 }; v8i16 q0_l = { 0 }; v8i16 q1_l = { 0 }; v8i16 q2_l = { 0 }; v16u8 tmp_flag; v16i8 zero = { 0 }; alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); LD_UB4( p_data - ( u_img_width << 1 ), u_img_width, p1_org, p0_org, q0_org, q1_org ); { v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha; p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = 
is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; } if( !__msa_test_bz_v( is_less_than ) ) { q2_org = LD_UB( p_data + ( 2 * u_img_width ) ); p3_org = LD_UB( p_data - ( u_img_width << 2 ) ); p2_org = LD_UB( p_data - ( 3 * u_img_width ) ); UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); tmp_flag = alpha >> 2; tmp_flag = tmp_flag + 2; tmp_flag = ( p0_asub_q0 < tmp_flag ); p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); is_less_than_beta = ( p2_asub_p0 < beta ); is_less_than_beta = is_less_than_beta & tmp_flag; negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); is_less_than_beta = is_less_than_beta & is_less_than; negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; { v8u16 is_less_than_beta_l, is_less_than_beta_r; q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); is_less_than_beta_r = ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) { v8i16 p3_org_r; ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, q0_org_r, p1_org_r, p2_r, q1_org_r, p0_r, p1_r, p2_r ); } q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); is_less_than_beta_l = ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) { v8i16 p3_org_l; ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, q0_org_l, p1_org_l, p2_l, q1_org_l, p0_l, p1_l, p2_l ); } } /* combine and store */ if( !__msa_test_bz_v( is_less_than_beta ) ) { PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta ); p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta ); ST_UB( p1_org, p_data - ( 2 * u_img_width ) ); ST_UB( p2_org, p_data - ( 3 * u_img_width ) ); } { v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; negate_is_less_than_beta_r = ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) ) { AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); } negate_is_less_than_beta_l = ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) negate_is_less_than_beta, 8 ); if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) ) { AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); } } if( !__msa_test_bz_v( negate_is_less_than_beta ) ) { p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r ); p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta ); } ST_UB( p0_org, p_data - u_img_width ); q3_org = LD_UB( p_data + ( 3 * u_img_width ) ); q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org ); is_less_than_beta = ( q2_asub_q0 < beta ); is_less_than_beta = is_less_than_beta & tmp_flag; negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); is_less_than_beta = is_less_than_beta & is_less_than; negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; { v8u16 is_less_than_beta_l, is_less_than_beta_r; is_less_than_beta_r = ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) { v8i16 q3_org_r; ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r, p0_org_r, q1_org_r, q2_r, p1_org_r, q0_r, q1_r, 
q2_r ); } is_less_than_beta_l = ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) { v8i16 q3_org_l; ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l, p0_org_l, q1_org_l, q2_l, p1_org_l, q0_l, q1_l, q2_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta ); q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta ); ST_UB( q1_org, p_data + u_img_width ); ST_UB( q2_org, p_data + 2 * u_img_width ); } { v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; negate_is_less_than_beta_r = ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) ) { AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); } negate_is_less_than_beta_l = ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) negate_is_less_than_beta, 8 ); if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) ) { AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); } } if( !__msa_test_bz_v( negate_is_less_than_beta ) ) { q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r ); q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta ); } ST_UB( q0_org, p_data ); } } static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { uint8_t *p_src; v16u8 alpha, beta, p0_asub_q0; v16u8 is_less_than_alpha, is_less_than; v16u8 is_less_than_beta, negate_is_less_than_beta; v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; v8i16 p2_r = { 0 }; v8i16 p1_r = { 0 }; v8i16 p0_r = { 0 }; v8i16 q0_r = { 0 }; v8i16 q1_r = { 0 }; v8i16 q2_r = { 0 }; v8i16 p2_l = { 0 }; v8i16 p1_l = { 0 }; v8i16 p0_l = { 0 }; v8i16 q0_l = { 0 }; v8i16 q1_l = { 0 }; v8i16 q2_l = { 0 }; v16i8 zero = { 0 }; v16u8 tmp_flag; p_src = p_data - 4; { v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16u8 row8, row9, row10, row11, row12, row13, row14, row15; LD_UB8( p_src, u_img_width, row0, row1, row2, row3, row4, row5, row6, row7 ); LD_UB8( p_src + ( 8 * u_img_width ), u_img_width, row8, row9, row10, row11, row12, row13, row14, row15 ); TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, row12, row13, row14, row15, p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org ); } UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); { v16u8 p1_asub_p0, q1_asub_q0; p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; } if( !__msa_test_bz_v( is_less_than ) ) { tmp_flag = alpha >> 2; tmp_flag = tmp_flag + 2; tmp_flag = ( p0_asub_q0 < tmp_flag ); { v16u8 p2_asub_p0; p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); is_less_than_beta = ( p2_asub_p0 < beta ); } 
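        /* Descriptive note (added for clarity; scalar sketch of the vector
         * code below, derived from AVC_LPF_P0P1P2_OR_Q0Q1Q2 above): when
         * abs(p0-q0) < (alpha>>2)+2 and abs(p2-p0) < beta, the bS=4 strong
         * luma filter is applied:
         *   p0' = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3
         *   p1' = ( p2 + p1 + p0 + q0 + 2 ) >> 2
         *   p2' = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3
         * otherwise only p0 is smoothed via AVC_LPF_P0_OR_Q0. */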
is_less_than_beta = tmp_flag & is_less_than_beta; negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); is_less_than_beta = is_less_than_beta & is_less_than; negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; { v16u8 is_less_than_beta_r; is_less_than_beta_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( is_less_than_beta_r ) ) { v8i16 p3_org_r; ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, q0_org_r, p1_org_r, p2_r, q1_org_r, p0_r, p1_r, p2_r ); } } { v16u8 is_less_than_beta_l; is_less_than_beta_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( is_less_than_beta_l ) ) { v8i16 p3_org_l; ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, q0_org_l, p1_org_l, p2_l, q1_org_l, p0_l, p1_l, p2_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { v16u8 p0, p2, p1; PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta ); p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta ); } { v16u8 negate_is_less_than_beta_r; negate_is_less_than_beta_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( negate_is_less_than_beta_r ) ) { AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); } } { v16u8 negate_is_less_than_beta_l; negate_is_less_than_beta_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) negate_is_less_than_beta, 8 ); if( !__msa_test_bz_v( negate_is_less_than_beta_l ) ) { AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); } } if( !__msa_test_bz_v( negate_is_less_than_beta ) ) { v16u8 p0; p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r ); p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta ); } { v16u8 q2_asub_q0; q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org ); is_less_than_beta = ( q2_asub_q0 < beta ); } is_less_than_beta = is_less_than_beta & tmp_flag; negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); is_less_than_beta = is_less_than_beta & is_less_than; negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; { v16u8 is_less_than_beta_r; is_less_than_beta_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( is_less_than_beta_r ) ) { v8i16 q3_org_r; ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r, p0_org_r, q1_org_r, q2_r, p1_org_r, q0_r, q1_r, q2_r ); } } { v16u8 is_less_than_beta_l; is_less_than_beta_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( is_less_than_beta_l ) ) { v8i16 q3_org_l; ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l ); AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l, p0_org_l, q1_org_l, q2_l, p1_org_l, q0_l, q1_l, q2_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { v16u8 q0, q1, q2; PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta ); q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta ); } { v16u8 negate_is_less_than_beta_r; negate_is_less_than_beta_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( negate_is_less_than_beta_r ) ) { AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); } } { v16u8 negate_is_less_than_beta_l; 
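            /* Left-half counterpart of the weak path: where the strong-filter
             * condition fails, only p0 is replaced,
             * p0' = ( 2*p1 + p0 + q1 + 2 ) >> 2, matching the scalar comment
             * above AVC_LPF_P0_OR_Q0. */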
negate_is_less_than_beta_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) negate_is_less_than_beta, 8 ); if( !__msa_test_bz_v( negate_is_less_than_beta_l ) ) { AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); } } if( !__msa_test_bz_v( negate_is_less_than_beta ) ) { v16u8 q0; q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r ); q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta ); } } { v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 ); ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 ); ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 ); ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 ); ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 ); p_src = p_data - 3; ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width ); ST2x4_UB( tmp2, 0, p_src + 4, u_img_width ); p_src += 4 * u_img_width; ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width ); ST2x4_UB( tmp2, 4, p_src + 4, u_img_width ); p_src += 4 * u_img_width; ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width ); ST2x4_UB( tmp5, 0, p_src + 4, u_img_width ); p_src += 4 * u_img_width; ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width ); ST2x4_UB( tmp5, 4, p_src + 4, u_img_width ); } } static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { v16u8 alpha, beta, is_less_than; v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org; v8i16 p0_r = { 0 }; v8i16 q0_r = { 0 }; v8i16 p0_l = { 0 }; v8i16 q0_l = { 0 }; alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width, p1_org, p0_org, q0_org, q1_org ); { v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; v16u8 is_less_than_alpha, is_less_than_beta; p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; } if( !__msa_test_bz_v( is_less_than ) ) { v16i8 zero = { 0 }; v16u8 is_less_than_r, is_less_than_l; is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 ); if( !__msa_test_bz_v( is_less_than_r ) ) { v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r ); AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); } is_less_than_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 ); if( !__msa_test_bz_v( is_less_than_l ) ) { v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l ); AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); } PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); ST_UB( p0_org, ( p_chroma - u_img_width ) ); ST_UB( q0_org, p_chroma ); } } static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { v16u8 is_less_than; v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org; v8i16 p0_r = { 0 }; v8i16 q0_r = { 0 }; v8i16 p0_l = { 0 }; v8i16 q0_l = { 0 }; v16u8 p1_u_org, p0_u_org, q0_u_org, 
q1_u_org; v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org; v16i8 tmp0, tmp1, tmp2, tmp3; v4i32 vec0, vec1; v16u8 row0, row1, row2, row3, row4, row5, row6, row7; LD_UB8( ( p_chroma - 4 ), u_img_width, row0, row1, row2, row3, row4, row5, row6, row7 ); TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7, p1_u_org, p1_v_org, p0_u_org, p0_v_org, q0_u_org, q0_v_org, q1_u_org, q1_v_org ); ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org, q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org ); { v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta; p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; } if( !__msa_test_bz_v( is_less_than ) ) { v16u8 is_less_than_r, is_less_than_l; v16i8 zero = { 0 }; is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 ); if( !__msa_test_bz_v( is_less_than_r ) ) { v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r ); AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); } is_less_than_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 ); if( !__msa_test_bz_v( is_less_than_l ) ) { v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l ); AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); } PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 ); ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 ); ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 ); ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 ); ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width ); } } static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data, uint8_t u_bs0, uint8_t u_bs1, uint8_t u_bs2, uint8_t u_bs3, uint8_t u_tc0, uint8_t u_tc1, uint8_t u_tc2, uint8_t u_tc3, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { uint8_t *p_src; v16u8 beta, tmp_vec, bs = { 0 }; v16u8 tc = { 0 }; v16u8 is_less_than, is_less_than_beta; v16u8 p1, p0, q0, q1; v8i16 p0_r, q0_r, p1_r = { 0 }; v8i16 q1_r = { 0 }; v8i16 p0_l, q0_l, p1_l = { 0 }; v8i16 q1_l = { 0 }; v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; v8i16 tc_r, tc_l; v16i8 zero = { 0 }; v16u8 is_bs_greater_than0; tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( 
v4i32 ) tmp_vec ); if( !__msa_test_bz_v( bs ) ) { tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 ); tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 ); tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 ); tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 ); tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec ); is_bs_greater_than0 = ( zero < bs ); { v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16u8 row8, row9, row10, row11, row12, row13, row14, row15; p_src = p_data; p_src -= 4; LD_UB8( p_src, u_img_width, row0, row1, row2, row3, row4, row5, row6, row7 ); p_src += ( 8 * u_img_width ); LD_UB8( p_src, u_img_width, row8, row9, row10, row11, row12, row13, row14, row15 ); TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, row10, row11, row12, row13, row14, row15, p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org ); } { v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha; v16u8 is_less_than_alpha; p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; is_less_than = is_less_than & is_bs_greater_than0; } if( !__msa_test_bz_v( is_less_than ) ) { v16i8 negate_tc, sign_negate_tc; v8i16 negate_tc_r, i16_negatetc_l; negate_tc = zero - ( v16i8 ) tc; sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l ); UNPCK_UB_SH( tc, tc_r, tc_l ); UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); { v16u8 p2_asub_p0; v16u8 is_less_than_beta_r, is_less_than_beta_l; p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); is_less_than_beta = ( p2_asub_p0 < beta ); is_less_than_beta = is_less_than_beta & is_less_than; is_less_than_beta_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( is_less_than_beta_r ) ) { p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org ); AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r, negate_tc_r, tc_r, p1_r ); } is_less_than_beta_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( is_less_than_beta_l ) ) { p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org ); AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l, i16_negatetc_l, tc_l, p1_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r ); p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); tc = tc + is_less_than_beta; } { v16u8 u8_q2asub_q0; v16u8 is_less_than_beta_l, is_less_than_beta_r; u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org ); is_less_than_beta = ( u8_q2asub_q0 < beta ); is_less_than_beta = is_less_than_beta & is_less_than; q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); is_less_than_beta_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( is_less_than_beta_r ) ) { q2_org_r = ( v8i16 ) 
__msa_ilvr_b( zero, ( v16i8 ) q2_org ); AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r, negate_tc_r, tc_r, q1_r ); } q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); is_less_than_beta_l = ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( is_less_than_beta_l ) ) { q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org ); AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l, i16_negatetc_l, tc_l, q1_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r ); q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); tc = tc + is_less_than_beta; } { v8i16 threshold_r, negate_thresh_r; v8i16 threshold_l, negate_thresh_l; v16i8 negate_thresh, sign_negate_thresh; negate_thresh = zero - ( v16i8 ) tc; sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 ); ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh, threshold_r, negate_thresh_r ); AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_thresh_r, threshold_r, p0_r, q0_r ); threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc ); negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh, negate_thresh ); AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, negate_thresh_l, threshold_l, p0_l, q0_l ); } PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); } { v16i8 tp0, tp1, tp2, tp3; v8i16 tmp2, tmp5; v4i32 tmp3, tmp4, tmp6, tmp7; uint32_t u_out0, u_out2; uint16_t u_out1, u_out3; p_src = p_data - 3; ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 ); ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 ); ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 ); ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 ); ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 ); u_out0 = __msa_copy_u_w( tmp3, 0 ); u_out1 = __msa_copy_u_h( tmp2, 0 ); u_out2 = __msa_copy_u_w( tmp3, 1 ); u_out3 = __msa_copy_u_h( tmp2, 1 ); SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp3, 2 ); u_out1 = __msa_copy_u_h( tmp2, 2 ); u_out2 = __msa_copy_u_w( tmp3, 3 ); u_out3 = __msa_copy_u_h( tmp2, 3 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp4, 0 ); u_out1 = __msa_copy_u_h( tmp2, 4 ); u_out2 = __msa_copy_u_w( tmp4, 1 ); u_out3 = __msa_copy_u_h( tmp2, 5 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp4, 2 ); u_out1 = __msa_copy_u_h( tmp2, 6 ); u_out2 = __msa_copy_u_w( tmp4, 3 ); u_out3 = __msa_copy_u_h( tmp2, 7 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp6, 0 ); u_out1 = __msa_copy_u_h( tmp5, 0 ); u_out2 = __msa_copy_u_w( tmp6, 1 ); u_out3 = __msa_copy_u_h( tmp5, 1 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp6, 2 ); u_out1 = __msa_copy_u_h( tmp5, 2 ); u_out2 = __msa_copy_u_w( tmp6, 3 ); u_out3 = __msa_copy_u_h( tmp5, 3 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( 
p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp7, 0 ); u_out1 = __msa_copy_u_h( tmp5, 4 ); u_out2 = __msa_copy_u_w( tmp7, 1 ); u_out3 = __msa_copy_u_h( tmp5, 5 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); u_out0 = __msa_copy_u_w( tmp7, 2 ); u_out1 = __msa_copy_u_h( tmp5, 6 ); u_out2 = __msa_copy_u_w( tmp7, 3 ); u_out3 = __msa_copy_u_h( tmp5, 7 ); p_src += u_img_width; SW( u_out0, p_src ); SH( u_out1, ( p_src + 4 ) ); p_src += u_img_width; SW( u_out2, p_src ); SH( u_out3, ( p_src + 4 ) ); } } } static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data, uint8_t u_bs0, uint8_t u_bs1, uint8_t u_bs2, uint8_t u_bs3, uint8_t u_tc0, uint8_t u_tc1, uint8_t u_tc2, uint8_t u_tc3, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_image_width ) { v16u8 p2_asub_p0, u8_q2asub_q0; v16u8 alpha, beta, is_less_than, is_less_than_beta; v16u8 p1, p0, q0, q1; v8i16 p1_r = { 0 }; v8i16 p0_r, q0_r, q1_r = { 0 }; v8i16 p1_l = { 0 }; v8i16 p0_l, q0_l, q1_l = { 0 }; v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; v16i8 zero = { 0 }; v16u8 tmp_vec; v16u8 bs = { 0 }; v16i8 tc = { 0 }; tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 ); bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec ); if( !__msa_test_bz_v( bs ) ) { tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 ); tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 ); tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 ); tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec ); tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 ); tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec ); alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); LD_UB5( p_data - ( 3 * u_image_width ), u_image_width, p2_org, p1_org, p0_org, q0_org, q1_org ); { v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; v16u8 is_less_than_alpha, is_bs_greater_than0; is_bs_greater_than0 = ( ( v16u8 ) zero < bs ); p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; is_less_than = is_less_than & is_bs_greater_than0; } if( !__msa_test_bz_v( is_less_than ) ) { v16i8 sign_negate_tc, negate_tc; v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r; q2_org = LD_UB( p_data + ( 2 * u_image_width ) ); negate_tc = zero - tc; sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l ); UNPCK_UB_SH( tc, tc_r, tc_l ); UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); is_less_than_beta = ( p2_asub_p0 < beta 
); is_less_than_beta = is_less_than_beta & is_less_than; { v8u16 is_less_than_beta_r, is_less_than_beta_l; is_less_than_beta_r = ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) { p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org ); AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r, negate_tc_r, tc_r, p1_r ); } is_less_than_beta_l = ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) { p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org ); AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l, i16_negatetc_l, tc_l, p1_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r ); p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); ST_UB( p1_org, p_data - ( 2 * u_image_width ) ); is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); tc = tc + ( v16i8 ) is_less_than_beta; } u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org ); is_less_than_beta = ( u8_q2asub_q0 < beta ); is_less_than_beta = is_less_than_beta & is_less_than; { v8u16 is_less_than_beta_r, is_less_than_beta_l; is_less_than_beta_r = ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) { q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org ); AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r, negate_tc_r, tc_r, q1_r ); } is_less_than_beta_l = ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) { q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org ); AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l, i16_negatetc_l, tc_l, q1_l ); } } if( !__msa_test_bz_v( is_less_than_beta ) ) { q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r ); q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); ST_UB( q1_org, p_data + u_image_width ); is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); tc = tc + ( v16i8 ) is_less_than_beta; } { v16i8 negate_thresh, sign_negate_thresh; v8i16 threshold_r, threshold_l; v8i16 negate_thresh_l, negate_thresh_r; negate_thresh = zero - tc; sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 ); ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh, threshold_r, negate_thresh_r ); AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_thresh_r, threshold_r, p0_r, q0_r ); threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc ); negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh, negate_thresh ); AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, negate_thresh_l, threshold_l, p0_l, q0_l ); } PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); ST_UB( p0_org, ( p_data - u_image_width ) ); ST_UB( q0_org, p_data ); } } } static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma, uint8_t u_bs0, uint8_t u_bs1, uint8_t u_bs2, uint8_t u_bs3, uint8_t u_tc0, uint8_t u_tc1, uint8_t u_tc2, uint8_t u_tc3, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { v16u8 alpha, beta; v4i32 tmp_vec, bs = { 0 }; v4i32 tc = { 0 }; v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; v16u8 is_less_than; v8i16 is_less_than_r, is_less_than_l; v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0; 
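    /* Descriptive note (added for clarity): AVC_LPF_P0Q0, used below for
     * edges with nonzero boundary strength, is the vector form of the normal
     * tc-clipped filter:
     *   delta = clip3( -tc, tc, ( 4*(q0 - p0) + (p1 - q1) + 4 ) >> 3 )
     *   p0'   = clip_uint8( p0 + delta )
     *   q0'   = clip_uint8( q0 - delta )
     * applied only where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta. */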
v16u8 p0, q0; v8i16 p0_r = { 0 }; v8i16 q0_r = { 0 }; v8i16 p0_l = { 0 }; v8i16 q0_l = { 0 }; v16u8 p1_org, p0_org, q0_org, q1_org; v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; v16i8 negate_tc, sign_negate_tc; v8i16 negate_tc_r, i16_negatetc_l; v8i16 tc_r, tc_l; v16i8 zero = { 0 }; v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 ); bs = __msa_insve_w( bs, 0, tmp_vec ); tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 ); bs = __msa_insve_w( bs, 1, tmp_vec ); tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 ); bs = __msa_insve_w( bs, 2, tmp_vec ); tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 ); bs = __msa_insve_w( bs, 3, tmp_vec ); if( !__msa_test_bz_v( ( v16u8 ) bs ) ) { tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 ); tc = __msa_insve_w( tc, 0, tmp_vec ); tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 ); tc = __msa_insve_w( tc, 1, tmp_vec ); tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 ); tc = __msa_insve_w( tc, 2, tmp_vec ); tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 ); tc = __msa_insve_w( tc, 3, tmp_vec ); is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs ); alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width, p1_org, p0_org, q0_org, q1_org ); p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; is_less_than = is_less_than & is_bs_greater_than0; if( !__msa_test_bz_v( is_less_than ) ) { negate_tc = zero - ( v16i8 ) tc; sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l ); UNPCK_UB_SH( tc, tc_r, tc_l ); UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); is_less_than_r = ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) ) { AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r, tc_r, p0_r, q0_r ); } is_less_than_l = ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) ) { AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, i16_negatetc_l, tc_l, p0_l, q0_l ); } PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); ST_UB( p0_org, p_chroma - u_img_width ); ST_UB( q0_org, p_chroma ); } } } static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma, uint8_t u_bs0, uint8_t u_bs1, uint8_t u_bs2, uint8_t u_bs3, uint8_t u_tc0, uint8_t u_tc1, uint8_t u_tc2, uint8_t u_tc3, uint8_t u_alpha_in, uint8_t u_beta_in, uint32_t u_img_width ) { v16u8 alpha, beta; v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0; v16u8 is_less_than, is_less_than1; v8i16 is_less_than_r, is_less_than_l; v16u8 is_less_than_beta, is_less_than_alpha; v8i16 p0_r = { 0 }; v8i16 q0_r = { 0 }; v8i16 p0_l = { 0 }; v8i16 q0_l = { 0 }; v16u8 p1_org, p0_org, q0_org, q1_org; v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; v16u8 is_bs_less_than4, is_bs_greater_than0; v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l; v16u8 const4; v16i8 zero = { 0 }; v8i16 tmp_vec, bs = { 0 }; 
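    /* Note: bs/tc carry one boundary-strength / clip value per edge segment
     * (two chroma rows each); below, each value is inserted into halfword
     * lanes n and n+4 so the same segment parameters cover both the Cb and
     * Cr halves once the interleaved components are split apart. */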
v8i16 tc = { 0 }; v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org; v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org; v16i8 tmp0, tmp1, tmp2, tmp3; v4i32 vec0, vec1; v16u8 row0, row1, row2, row3, row4, row5, row6, row7; v16i8 negate_tc, sign_negate_tc; const4 = ( v16u8 ) __msa_ldi_b( 4 ); tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 ); bs = __msa_insve_h( bs, 0, tmp_vec ); bs = __msa_insve_h( bs, 4, tmp_vec ); tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 ); bs = __msa_insve_h( bs, 1, tmp_vec ); bs = __msa_insve_h( bs, 5, tmp_vec ); tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 ); bs = __msa_insve_h( bs, 2, tmp_vec ); bs = __msa_insve_h( bs, 6, tmp_vec ); tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 ); bs = __msa_insve_h( bs, 3, tmp_vec ); bs = __msa_insve_h( bs, 7, tmp_vec ); if( !__msa_test_bz_v( ( v16u8 ) bs ) ) { tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 ); tc = __msa_insve_h( tc, 0, tmp_vec ); tc = __msa_insve_h( tc, 4, tmp_vec ); tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 ); tc = __msa_insve_h( tc, 1, tmp_vec ); tc = __msa_insve_h( tc, 5, tmp_vec ); tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 ); tc = __msa_insve_h( tc, 2, tmp_vec ); tc = __msa_insve_h( tc, 6, tmp_vec ); tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 ); tc = __msa_insve_h( tc, 3, tmp_vec ); tc = __msa_insve_h( tc, 7, tmp_vec ); is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs ); LD_UB8( ( p_chroma - 4 ), u_img_width, row0, row1, row2, row3, row4, row5, row6, row7 ); TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7, p1_u_org, p1_v_org, p0_u_org, p0_v_org, q0_u_org, q0_v_org, q1_u_org, q1_v_org ); ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org, q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org ); p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); beta = ( v16u8 ) __msa_fill_b( u_beta_in ); is_less_than_alpha = ( p0_asub_q0 < alpha ); is_less_than_beta = ( p1_asub_p0 < beta ); is_less_than = is_less_than_beta & is_less_than_alpha; is_less_than_beta = ( q1_asub_q0 < beta ); is_less_than = is_less_than_beta & is_less_than; is_less_than = is_bs_greater_than0 & is_less_than; if( !__msa_test_bz_v( is_less_than ) ) { UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); is_bs_less_than4 = ( ( v16u8 ) bs < const4 ); is_less_than1 = is_less_than & is_bs_less_than4; if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) ) { negate_tc = zero - ( v16i8 ) tc; sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l ); UNPCK_UB_SH( tc, tc_r, tc_l ); is_less_than_r = ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) ) { AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r, tc_r, p0_r, q0_r ); } is_less_than_l = ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 ); if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) ) { AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, i16_negatetc_l, tc_l, p0_l, q0_l ); } PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 ); q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 ); } SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 ); ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 ); ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 ); ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 ); 
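            /* The filtered Cb/Cr p0/q0 samples were re-interleaved above into
             * 4-byte groups (p0_cb, p0_cr, q0_cb, q0_cr); store one group per
             * row, starting two bytes to the left of the edge. */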
ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width ); } } } static void avc_deblock_strength_msa( uint8_t *nnz, int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE], int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t pu_bs[2][8][4], int32_t i_mvy_limit ) { uint32_t u_tmp; v16u8 nnz0, nnz1, nnz2, nnz3, nnz4; v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 }; v16i8 ref0, ref1, ref2, ref3, ref4; v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5; v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b; v8u16 four, mvy_limit_vec, sub0, sub1; nnz0 = LD_UB( nnz + 4 ); nnz2 = LD_UB( nnz + 20 ); nnz4 = LD_UB( nnz + 36 ); ref0 = LD_SB( pi_ref[0] + 4 ); ref2 = LD_SB( pi_ref[0] + 20 ); ref4 = LD_SB( pi_ref[0] + 36 ); mv0 = LD_SH( ( pi_mv[0] + 4 )[0] ); mv1 = LD_SH( ( pi_mv[0] + 12 )[0] ); mv2 = LD_SH( ( pi_mv[0] + 20 )[0] ); mv3 = LD_SH( ( pi_mv[0] + 28 )[0] ); mv4 = LD_SH( ( pi_mv[0] + 36 )[0] ); mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit ); four = ( v8u16 ) __msa_fill_h( 4 ); mask = ( v16u8 ) __msa_ldi_b( 0 ); one = ( v16u8 ) __msa_ldi_b( 1 ); two = ( v16u8 ) __msa_ldi_b( 2 ); mv5 = __msa_pckod_h( mv0, mv0 ); mv6 = __msa_pckod_h( mv1, mv1 ); mv_a = __msa_pckev_h( mv0, mv0 ); mv_b = __msa_pckev_h( mv1, mv1 ); nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 ); ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 ); nnz_mask = nnz0 | nnz1; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[1][0] ); dst = ( v16u8 ) __msa_ldi_b( 0 ); two = ( v16u8 ) __msa_ldi_b( 2 ); mv5 = __msa_pckod_h( mv1, mv1 ); mv6 = __msa_pckod_h( mv2, mv2 ); mv_a = __msa_pckev_h( mv1, mv1 ); mv_b = __msa_pckev_h( mv2, mv2 ); nnz_mask = nnz2 | nnz1; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[1][1] ); dst = ( v16u8 ) __msa_ldi_b( 0 ); two = ( v16u8 ) __msa_ldi_b( 2 ); mv5 = __msa_pckod_h( mv2, mv2 ); mv6 = __msa_pckod_h( mv3, mv3 ); mv_a = __msa_pckev_h( mv2, mv2 ); mv_b = __msa_pckev_h( mv3, mv3 ); nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 ); ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 ); nnz_mask = nnz3 | nnz2; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, 
mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[1][2] ); dst = ( v16u8 ) __msa_ldi_b( 0 ); two = ( v16u8 ) __msa_ldi_b( 2 ); mv5 = __msa_pckod_h( mv3, mv3 ); mv6 = __msa_pckod_h( mv4, mv4 ); mv_a = __msa_pckev_h( mv3, mv3 ); mv_b = __msa_pckev_h( mv4, mv4 ); nnz_mask = nnz4 | nnz3; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[1][3] ); nnz0 = LD_UB( nnz + 8 ); nnz2 = LD_UB( nnz + 24 ); ref0 = LD_SB( pi_ref[0] + 8 ); ref2 = LD_SB( pi_ref[0] + 24 ); mv0 = LD_SH( ( pi_mv[0] + 8 )[0] ); mv1 = LD_SH( ( pi_mv[0] + 12 )[0] ); mv2 = LD_SH( ( pi_mv[0] + 16 )[0] ); mv3 = LD_SH( ( pi_mv[0] + 20 )[0] ); mv4 = LD_SH( ( pi_mv[0] + 24 )[0] ); mv7 = LD_SH( ( pi_mv[0] + 28 )[0] ); mv8 = LD_SH( ( pi_mv[0] + 32 )[0] ); mv9 = LD_SH( ( pi_mv[0] + 36 )[0] ); nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 ); nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 ); ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 ); ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 ); nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 ); nnz1 = ( v16u8 ) temp_vec4; nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 ); nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 ); nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 ); ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 ); ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 ); ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 ); ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 ); ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 ); ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 ); ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 ); ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 ); TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 ); TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 ); mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit ); four = ( v8u16 ) __msa_fill_h( 4 ); mask = ( v16u8 ) __msa_ldi_b( 0 ); one = ( v16u8 ) __msa_ldi_b( 1 ); two = ( v16u8 ) __msa_ldi_b( 2 ); dst = ( v16u8 ) __msa_ldi_b( 0 ); mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 ); mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 ); mv_a = mv0; mv_b = mv1; nnz_mask = nnz0 | nnz1; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) 
__msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[0][0] ); two = ( v16u8 ) __msa_ldi_b( 2 ); dst = ( v16u8 ) __msa_ldi_b( 0 ); mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 ); mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 ); mv_a = mv1; mv_b = mv2; nnz_mask = nnz1 | nnz2; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[0][1] ); two = ( v16u8 ) __msa_ldi_b( 2 ); dst = ( v16u8 ) __msa_ldi_b( 0 ); mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 ); mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 ); mv_a = mv2; mv_b = mv3; nnz_mask = nnz2 | nnz3; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[0][2] ); two = ( v16u8 ) __msa_ldi_b( 2 ); dst = ( v16u8 ) __msa_ldi_b( 0 ); mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 ); mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 ); mv_a = mv3; mv_b = mv4; nnz_mask = nnz3 | nnz4; nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); two = __msa_bmnz_v( two, mask, nnz_mask ); ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 ); ref_mask = ref_mask ^ 255; sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); dst = __msa_bmnz_v( dst, one, ref_mask ); dst = __msa_bmnz_v( two, dst, nnz_mask ); u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); SW( u_tmp, pu_bs[0][3] ); } void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta ) { avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha, ( uint8_t ) i_beta, i_stride ); } void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta ) { avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha, ( uint8_t ) i_beta, i_stride ); } void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta ) { avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( 
uint8_t ) i_alpha, ( uint8_t ) i_beta, i_stride ); } void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta ) { avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha, ( uint8_t ) i_beta, i_stride ); } void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) { uint8_t u_bs0 = 1; uint8_t u_bs1 = 1; uint8_t u_bs2 = 1; uint8_t u_bs3 = 1; if( p_tc0[0] < 0 ) u_bs0 = 0; if( p_tc0[1] < 0 ) u_bs1 = 0; if( p_tc0[2] < 0 ) u_bs2 = 0; if( p_tc0[3] < 0 ) u_bs3 = 0; avc_loopfilter_luma_inter_edge_ver_msa( p_pix, u_bs0, u_bs1, u_bs2, u_bs3, p_tc0[0], p_tc0[1], p_tc0[2], p_tc0[3], i_alpha, i_beta, i_stride ); } void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) { uint8_t u_bs0 = 1; uint8_t u_bs1 = 1; uint8_t u_bs2 = 1; uint8_t u_bs3 = 1; if( p_tc0[0] < 0 ) u_bs0 = 0; if( p_tc0[1] < 0 ) u_bs1 = 0; if( p_tc0[2] < 0 ) u_bs2 = 0; if( p_tc0[3] < 0 ) u_bs3 = 0; avc_loopfilter_luma_inter_edge_hor_msa( p_pix, u_bs0, u_bs1, u_bs2, u_bs3, p_tc0[0], p_tc0[1], p_tc0[2], p_tc0[3], i_alpha, i_beta, i_stride ); } void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) { uint8_t u_bs0 = 1; uint8_t u_bs1 = 1; uint8_t u_bs2 = 1; uint8_t u_bs3 = 1; if( p_tc0[0] < 0 ) u_bs0 = 0; if( p_tc0[1] < 0 ) u_bs1 = 0; if( p_tc0[2] < 0 ) u_bs2 = 0; if( p_tc0[3] < 0 ) u_bs3 = 0; avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix, u_bs0, u_bs1, u_bs2, u_bs3, p_tc0[0], p_tc0[1], p_tc0[2], p_tc0[3], i_alpha, i_beta, i_stride ); } void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride, int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) { uint8_t u_bs0 = 1; uint8_t u_bs1 = 1; uint8_t u_bs2 = 1; uint8_t u_bs3 = 1; if( p_tc0[0] < 0 ) u_bs0 = 0; if( p_tc0[1] < 0 ) u_bs1 = 0; if( p_tc0[2] < 0 ) u_bs2 = 0; if( p_tc0[3] < 0 ) u_bs3 = 0; avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix, u_bs0, u_bs1, u_bs2, u_bs3, p_tc0[0], p_tc0[1], p_tc0[2], p_tc0[3], i_alpha, i_beta, i_stride ); } void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE], int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE], int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t pu_bs[2][8][4], int32_t i_mvy_limit, int32_t i_bframe ) { if( i_bframe ) { for( int32_t i_dir = 0; i_dir < 2; i_dir++ ) { int32_t s1 = i_dir ? 1 : 8; int32_t s2 = i_dir ? 
8 : 1; for( int32_t i_edge = 0; i_edge < 4; i_edge++ ) { for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4; i++, loc += s1 ) { int32_t locn = loc - s2; if( u_nnz[loc] || u_nnz[locn] ) { pu_bs[i_dir][i_edge][i] = 2; } else if( pi_ref[0][loc] != pi_ref[0][locn] || abs( pi_mv[0][loc][0] - pi_mv[0][locn][0] ) >= 4 || abs( pi_mv[0][loc][1] - pi_mv[0][locn][1] ) >= i_mvy_limit || ( i_bframe && ( pi_ref[1][loc] != pi_ref[1][locn] || abs( pi_mv[1][loc][0] - pi_mv[1][locn][0] ) >= 4 || abs( pi_mv[1][loc][1] - pi_mv[1][locn][1] ) >= i_mvy_limit ) ) ) { pu_bs[i_dir][i_edge][i] = 1; } else { pu_bs[i_dir][i_edge][i] = 0; } } } } } else { avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit ); } } #endif x264-master/common/mips/deblock.h000066400000000000000000000056411502133446700171010ustar00rootroot00000000000000/***************************************************************************** * deblock.h: msa deblocking ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_MIPS_DEBLOCK_H #define X264_MIPS_DEBLOCK_H #if !HIGH_BIT_DEPTH #define x264_deblock_v_luma_msa x264_template(deblock_v_luma_msa) void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_msa x264_template(deblock_h_luma_msa) void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_chroma_msa x264_template(deblock_v_chroma_msa) void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_msa x264_template(deblock_h_chroma_msa) void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_luma_intra_msa x264_template(deblock_v_luma_intra_msa) void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_msa x264_template(deblock_h_luma_intra_msa) void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_msa x264_template(deblock_v_chroma_intra_msa) void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_msa x264_template(deblock_h_chroma_intra_msa) void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_strength_msa x264_template(deblock_strength_msa) void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #endif #endif x264-master/common/mips/macros.h000066400000000000000000003045071502133446700167650ustar00rootroot00000000000000/***************************************************************************** * macros.h: msa macros ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Rishikesh More * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_MIPS_MACROS_H #define X264_MIPS_MACROS_H #include #include #define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) #define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ ) #define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ ) #define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) #define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ ) #define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) #define LD_SW( ... 
) LD_W( v4i32, __VA_ARGS__ ) #define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) #define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ ) #define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ ) #define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) #define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ ) #define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ ) #if ( __mips_isa_rev >= 6 ) #define LH( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint16_t u_val_h_m; \ \ asm volatile ( \ "lh %[u_val_h_m], %[p_src_m] \n\t" \ \ : [u_val_h_m] "=r" ( u_val_h_m ) \ : [p_src_m] "m" ( *p_src_m ) \ ); \ \ u_val_h_m; \ } ) #define LW( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint32_t u_val_w_m; \ \ asm volatile ( \ "lw %[u_val_w_m], %[p_src_m] \n\t" \ \ : [u_val_w_m] "=r" ( u_val_w_m ) \ : [p_src_m] "m" ( *p_src_m ) \ ); \ \ u_val_w_m; \ } ) #if ( __mips == 64 ) #define LD( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint64_t u_val_d_m = 0; \ \ asm volatile ( \ "ld %[u_val_d_m], %[p_src_m] \n\t" \ \ : [u_val_d_m] "=r" ( u_val_d_m ) \ : [p_src_m] "m" ( *p_src_m ) \ ); \ \ u_val_d_m; \ } ) #else // !( __mips == 64 ) #define LD( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint32_t u_val0_m, u_val1_m; \ uint64_t u_val_d_m = 0; \ \ u_val0_m = LW( p_src_m ); \ u_val1_m = LW( p_src_m + 4 ); \ \ u_val_d_m = ( uint64_t ) ( u_val1_m ); \ u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ 0xFFFFFFFF00000000 ); \ u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ \ u_val_d_m; \ } ) #endif // ( __mips == 64 ) #define SH( u_val, p_dst ) \ { \ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ uint16_t u_val_h_m = ( u_val ); \ \ asm volatile ( \ "sh %[u_val_h_m], %[p_dst_m] \n\t" \ \ : [p_dst_m] "=m" ( *p_dst_m ) \ : [u_val_h_m] "r" ( u_val_h_m ) \ ); \ } #define SW( u_val, p_dst ) \ { \ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ uint32_t u_val_w_m = ( u_val ); \ \ asm volatile ( \ "sw %[u_val_w_m], %[p_dst_m] \n\t" \ \ : [p_dst_m] "=m" ( *p_dst_m ) \ : [u_val_w_m] "r" ( u_val_w_m ) \ ); \ } #define SD( u_val, p_dst ) \ { \ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ uint64_t u_val_d_m = ( u_val ); \ \ asm volatile ( \ "sd %[u_val_d_m], %[p_dst_m] \n\t" \ \ : [p_dst_m] "=m" ( *p_dst_m ) \ : [u_val_d_m] "r" ( u_val_d_m ) \ ); \ } #else // !( __mips_isa_rev >= 6 ) #define LH( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint16_t u_val_h_m; \ \ asm volatile ( \ "ulh %[u_val_h_m], %[p_src_m] \n\t" \ \ : [u_val_h_m] "=r" ( u_val_h_m ) \ : [p_src_m] "m" ( *p_src_m ) \ ); \ \ u_val_h_m; \ } ) #define LW( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint32_t u_val_w_m; \ \ asm volatile ( \ "ulw %[u_val_w_m], %[p_src_m] \n\t" \ \ : [u_val_w_m] "=r" ( u_val_w_m ) \ : [p_src_m] "m" ( *p_src_m ) \ ); \ \ u_val_w_m; \ } ) #if ( __mips == 64 ) #define LD( p_src ) \ ( { \ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ uint64_t u_val_d_m = 0; \ \ asm volatile ( \ "uld %[u_val_d_m], %[p_src_m] \n\t" \ \ : [u_val_d_m] "=r" ( u_val_d_m ) \ : [p_src_m] "m" ( *p_src_m ) \ ); \ \ u_val_d_m; \ } ) #else // !( __mips == 64 ) #define LD( p_src ) \ ( { \ uint8_t *psrc_m1 = ( uint8_t * ) ( p_src ); \ uint32_t u_val0_m, u_val1_m; \ uint64_t u_val_d_m = 0; \ \ u_val0_m = LW( psrc_m1 ); \ u_val1_m = LW( psrc_m1 + 4 ); \ \ u_val_d_m = ( uint64_t ) ( u_val1_m ); \ u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ 0xFFFFFFFF00000000 ); \ u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ \ u_val_d_m; \ } ) #endif // ( 
__mips == 64 ) #define SH( u_val, p_dst ) \ { \ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ uint16_t u_val_h_m = ( u_val ); \ \ asm volatile ( \ "ush %[u_val_h_m], %[p_dst_m] \n\t" \ \ : [p_dst_m] "=m" ( *p_dst_m ) \ : [u_val_h_m] "r" ( u_val_h_m ) \ ); \ } #define SW( u_val, p_dst ) \ { \ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ uint32_t u_val_w_m = ( u_val ); \ \ asm volatile ( \ "usw %[u_val_w_m], %[p_dst_m] \n\t" \ \ : [p_dst_m] "=m" ( *p_dst_m ) \ : [u_val_w_m] "r" ( u_val_w_m ) \ ); \ } #define SD( u_val, p_dst ) \ { \ uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst ); \ uint32_t u_val0_m, u_val1_m; \ \ u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF ); \ u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \ \ SW( u_val0_m, p_dst_m1 ); \ SW( u_val1_m, p_dst_m1 + 4 ); \ } #endif // ( __mips_isa_rev >= 6 ) /* Description : Load 4 words with stride Arguments : Inputs - psrc (source pointer to load from) - stride Outputs - out0, out1, out2, out3 Details : Load word in 'out0' from (psrc) Load word in 'out1' from (psrc + stride) Load word in 'out2' from (psrc + 2 * stride) Load word in 'out3' from (psrc + 3 * stride) */ #define LW4( p_src, stride, out0, out1, out2, out3 ) \ { \ out0 = LW( ( p_src ) ); \ out1 = LW( ( p_src ) + stride ); \ out2 = LW( ( p_src ) + 2 * stride ); \ out3 = LW( ( p_src ) + 3 * stride ); \ } /* Description : Store 4 words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Store word from 'in0' to (pdst) Store word from 'in1' to (pdst + stride) Store word from 'in2' to (pdst + 2 * stride) Store word from 'in3' to (pdst + 3 * stride) */ #define SW4( in0, in1, in2, in3, p_dst, stride ) \ { \ SW( in0, ( p_dst ) ) \ SW( in1, ( p_dst ) + stride ); \ SW( in2, ( p_dst ) + 2 * stride ); \ SW( in3, ( p_dst ) + 3 * stride ); \ } /* Description : Store 4 double words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Store double word from 'in0' to (pdst) Store double word from 'in1' to (pdst + stride) Store double word from 'in2' to (pdst + 2 * stride) Store double word from 'in3' to (pdst + 3 * stride) */ #define SD4( in0, in1, in2, in3, p_dst, stride ) \ { \ SD( in0, ( p_dst ) ) \ SD( in1, ( p_dst ) + stride ); \ SD( in2, ( p_dst ) + 2 * stride ); \ SD( in3, ( p_dst ) + 3 * stride ); \ } /* Description : Load vectors with 16 byte elements with stride Arguments : Inputs - psrc (source pointer to load from) - stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ #define LD_B2( RTYPE, p_src, stride, out0, out1 ) \ { \ out0 = LD_B( RTYPE, ( p_src ) ); \ out1 = LD_B( RTYPE, ( p_src ) + stride ); \ } #define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ ) #define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ ) #define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \ { \ LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ out2 = LD_B( RTYPE, ( p_src ) + 2 * stride ); \ } #define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ ) #define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ ) #define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ { \ LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ LD_B2( RTYPE, ( p_src ) + 2 * stride , stride, out2, out3 ); \ } #define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ ) #define LD_SB4( ... 
) LD_B4( v16i8, __VA_ARGS__ ) #define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \ { \ LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ out4 = LD_B( RTYPE, ( p_src ) + 4 * stride ); \ } #define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ ) #define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ ) #define LD_B8( RTYPE, p_src, stride, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ } #define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ ) #define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ ) /* Description : Load vectors with 8 halfword elements with stride Arguments : Inputs - psrc (source pointer to load from) - stride Outputs - out0, out1 Details : Load 8 halfword elements in 'out0' from (psrc) Load 8 halfword elements in 'out1' from (psrc + stride) */ #define LD_H2( RTYPE, p_src, stride, out0, out1 ) \ { \ out0 = LD_H( RTYPE, ( p_src ) ); \ out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \ } #define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ ) #define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ { \ LD_H2( RTYPE, ( p_src ), stride, out0, out1 ); \ LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \ } #define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ ) #define LD_H8( RTYPE, p_src, stride, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ } #define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ ) /* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) Arguments : Inputs - psrc Outputs - out0, out1, out2, out3 */ #define LD4x4_SH( p_src, out0, out1, out2, out3 ) \ { \ out0 = LD_SH( p_src ); \ out2 = LD_SH( p_src + 8 ); \ out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \ } /* Description : Load 2 vectors of signed word elements with stride Arguments : Inputs - psrc (source pointer to load from) - stride Outputs - out0, out1 Return Type - signed word */ #define LD_SW2( p_src, stride, out0, out1 ) \ { \ out0 = LD_SW( ( p_src ) ); \ out1 = LD_SW( ( p_src ) + stride ); \ } /* Description : Store vectors of 16 byte elements with stride Arguments : Inputs - in0, in1, stride - pdst (destination pointer to store to) Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ #define ST_B2( RTYPE, in0, in1, p_dst, stride ) \ { \ ST_B( RTYPE, in0, ( p_dst ) ); \ ST_B( RTYPE, in1, ( p_dst ) + stride ); \ } #define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ ) #define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ { \ ST_B2( RTYPE, in0, in1, ( p_dst ), stride ); \ ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ } #define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ ) #define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ ) #define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ p_dst, stride ) \ { \ ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ); \ ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ } #define ST_UB8( ... 
) ST_B8( v16u8, __VA_ARGS__ ) /* Description : Store vectors of 8 halfword elements with stride Arguments : Inputs - in0, in1, stride - pdst (destination pointer to store to) Details : Store 8 halfword elements from 'in0' to (pdst) Store 8 halfword elements from 'in1' to (pdst + stride) */ #define ST_H2( RTYPE, in0, in1, p_dst, stride ) \ { \ ST_H( RTYPE, in0, ( p_dst ) ); \ ST_H( RTYPE, in1, ( p_dst ) + stride ); \ } #define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ ) #define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ { \ ST_H2( RTYPE, in0, in1, ( p_dst ), stride ); \ ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ } #define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ ) #define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \ { \ ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride ); \ ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ } #define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ ) /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride Details : Index 'stidx' halfword element from 'in' vector is copied to GP register and stored to (pdst) Index 'stidx+1' halfword element from 'in' vector is copied to GP register and stored to (pdst + stride) Index 'stidx+2' halfword element from 'in' vector is copied to GP register and stored to (pdst + 2 * stride) Index 'stidx+3' halfword element from 'in' vector is copied to GP register and stored to (pdst + 3 * stride) */ #define ST2x4_UB( in, stidx, p_dst, stride ) \ { \ uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst ); \ \ u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) ); \ u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) ); \ u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) ); \ u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) ); \ \ SH( u_out0_m, pblk_2x4_m ); \ SH( u_out1_m, pblk_2x4_m + stride ); \ SH( u_out2_m, pblk_2x4_m + 2 * stride ); \ SH( u_out3_m, pblk_2x4_m + 3 * stride ); \ } /* Description : Store 4x4 byte block to destination memory from input vector Arguments : Inputs - in0, in1, pdst, stride Details : 'Idx0' word element from input vector 'in0' is copied to GP register and stored to (pdst) 'Idx1' word element from input vector 'in0' is copied to GP register and stored to (pdst + stride) 'Idx2' word element from input vector 'in0' is copied to GP register and stored to (pdst + 2 * stride) 'Idx3' word element from input vector 'in0' is copied to GP register and stored to (pdst + 3 * stride) */ #define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride ) \ { \ uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst ); \ \ u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 ); \ u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 ); \ u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 ); \ u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 ); \ \ SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \ } #define ST4x8_UB( in0, in1, p_dst, stride ) \ { \ uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst ); \ \ ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride ); \ ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride ); \ } /* Description : Store 8x1 byte block to destination memory from input vector Arguments : Inputs - in, pdst Details : Index 0 double word element from 'in' vector is copied to GP register and stored to (pdst) */ #define ST8x1_UB( in, p_dst ) \ { \ uint64_t u_out0_m; \ u_out0_m = 
__msa_copy_u_d( ( v2i64 ) in, 0 ); \ SD( u_out0_m, p_dst ); \ } /* Description : Store 8x4 byte block to destination memory from input vectors Arguments : Inputs - in0, in1, pdst, stride Details : Index 0 double word element from 'in0' vector is copied to GP register and stored to (pdst) Index 1 double word element from 'in0' vector is copied to GP register and stored to (pdst + stride) Index 0 double word element from 'in1' vector is copied to GP register and stored to (pdst + 2 * stride) Index 1 double word element from 'in1' vector is copied to GP register and stored to (pdst + 3 * stride) */ #define ST8x4_UB( in0, in1, p_dst, stride ) \ { \ uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst ); \ \ u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 ); \ u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 ); \ u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 ); \ u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 ); \ \ SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \ } /* Description : average with rounding (in0 + in1 + 1) / 2. Arguments : Inputs - in0, in1, in2, in3, Outputs - out0, out1 Return Type - as per RTYPE Details : Each unsigned byte element from 'in0' vector is added with each unsigned byte element from 'in1' vector. Average with rounding is calculated and written to 'out0' */ #define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \ out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \ } #define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ ) #define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 ) \ } #define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ ) /* Description : Immediate number of elements to slide with zero Arguments : Inputs - in0, in1, slide_val Outputs - out0, out1 Return Type - as per RTYPE Details : Byte elements from 'zero_m' vector are slide into 'in0' by value specified in 'slide_val' */ #define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val ) \ { \ v16i8 zero_m = { 0 }; \ out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ ( v16i8 ) in0, slide_val ); \ out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ ( v16i8 ) in1, slide_val ); \ } #define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ ) /* Description : Immediate number of elements to slide Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val Outputs - out0, out1 Return Type - as per RTYPE Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by value specified in 'slide_val' */ #define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \ { \ out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0, \ slide_val ); \ out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1, \ slide_val ); \ } #define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ ) /* Description : Shuffle byte vector elements as per mask vector Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 Outputs - out0, out1 Return Type - as per RTYPE Details : Selective byte elements from 'in0' & 'in1' are copied to 'out0' as per control vector 'mask0' */ #define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0, \ ( v16i8 ) in1, ( v16i8 ) in0 ); \ out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1, \ ( v16i8 ) in3, ( v16i8 ) in2 ); \ } #define VSHF_B2_UB( ... 
) VSHF_B2( v16u8, __VA_ARGS__ ) #define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ ) /* Description : Shuffle halfword vector elements as per mask vector Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 Outputs - out0, out1 Return Type - as per RTYPE Details : Selective byte elements from 'in0' & 'in1' are copied to 'out0' as per control vector 'mask0' */ #define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0, \ ( v8i16 ) in1, ( v8i16 ) in0 ); \ out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1, \ ( v8i16 ) in3, ( v8i16 ) in2 ); \ } #define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ ) /* Description : Dot product of byte vector elements Arguments : Inputs - mult0, mult1 cnst0, cnst1 Outputs - out0, out1 Return Type - as per RTYPE Details : Unsigned byte elements from 'mult0' are multiplied with unsigned byte elements from 'cnst0' producing a result twice the size of input i.e. unsigned halfword. Multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ #define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 ); \ out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 ); \ } #define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ ) #define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3, \ cnst0, cnst1, cnst2, cnst3, \ out0, out1, out2, out3 ) \ { \ DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \ DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \ } #define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ ) /* Description : Dot product of byte vector elements Arguments : Inputs - mult0, mult1 cnst0, cnst1 Outputs - out0, out1 Return Type - as per RTYPE Details : Signed byte elements from 'mult0' are multiplied with signed byte elements from 'cnst0' producing a result twice the size of input i.e. signed halfword. Multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ #define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0, \ ( v16i8 ) mult0, ( v16i8 ) cnst0 ); \ out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1, \ ( v16i8 ) mult1, ( v16i8 ) cnst1 ); \ } #define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ ) #define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3, \ cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 ) \ { \ DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \ DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \ } #define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ ) /* Description : Dot product of halfword vector elements Arguments : Inputs - mult0, mult1 cnst0, cnst1 Outputs - out0, out1 Return Type - as per RTYPE Details : Signed halfword elements from 'mult0' are multiplied with signed halfword elements from 'cnst0' producing a result twice the size of input i.e. signed word. Multiplication result of adjacent odd-even elements are added together and written to the 'out0' vector */ #define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0, \ ( v8i16 ) mult0, ( v8i16 ) cnst0 ); \ out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1, \ ( v8i16 ) mult1, ( v8i16 ) cnst1 ); \ } #define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ ) /* Description : Clips all halfword elements of input vector between min & max out = (in < min) ? 
min : ((in > max) ? max : in) Arguments : Inputs - in, min, max Output - out_m Return Type - signed halfword */ #define CLIP_SH( in, min, max ) \ ( { \ v8i16 out_m; \ \ out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in ); \ out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m ); \ out_m; \ } ) /* Description : Clips all signed halfword elements of input vector between 0 & 255 Arguments : Input - in Output - out_m Return Type - signed halfword */ #define CLIP_SH_0_255( in ) \ ( { \ v8i16 max_m = __msa_ldi_h( 255 ); \ v8i16 out_m; \ \ out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 ); \ out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \ out_m; \ } ) #define CLIP_SH2_0_255( in0, in1 ) \ { \ in0 = CLIP_SH_0_255( in0 ); \ in1 = CLIP_SH_0_255( in1 ); \ } #define CLIP_SH4_0_255( in0, in1, in2, in3 ) \ { \ CLIP_SH2_0_255( in0, in1 ); \ CLIP_SH2_0_255( in2, in3 ); \ } /* Description : Horizontal addition of 4 signed word elements of input vector Arguments : Input - in (signed word vector) Output - sum_m (i32 sum) Return Type - signed word (GP) Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ #define HADD_SW_S32( in ) \ ( { \ v2i64 res0_m, res1_m; \ int32_t i_sum_m; \ \ res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in ); \ res1_m = __msa_splati_d( res0_m, 1 ); \ res0_m = res0_m + res1_m; \ i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 ); \ i_sum_m; \ } ) /* Description : Horizontal addition of 4 signed word elements of input vector Arguments : Input - in (signed word vector) Output - sum_m (i32 sum) Return Type - signed word (GP) Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ #define HADD_UH_U32( in ) \ ( { \ v4u32 res_m; \ v2u64 res0_m, res1_m; \ uint32_t u_sum_m; \ \ res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in ); \ res0_m = __msa_hadd_u_d( res_m, res_m ); \ res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 ); \ res0_m = res0_m + res1_m; \ u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 ); \ u_sum_m; \ } ) /* Description : Horizontal addition of signed byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Each signed odd byte element from 'in0' is added to even signed byte element from 'in0' (pairwise) and the halfword result is written in 'out0' */ #define HADD_SB2( RTYPE, in0, in1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 ); \ out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 ); \ } #define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ HADD_SB2( RTYPE, in0, in1, out0, out1 ); \ HADD_SB2( RTYPE, in2, in3, out2, out3 ); \ } #define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ ) /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Each unsigned odd byte element from 'in0' is added to even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ #define HADD_UB2( RTYPE, in0, in1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \ out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \ } #define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ ) #define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ HADD_UB2( RTYPE, in0, in1, out0, out1 ); \ HADD_UB2( RTYPE, in2, in3, out2, out3 ); \ } #define HADD_UB4_UH( ... 
) HADD_UB4( v8u16, __VA_ARGS__ ) /* Description : Horizontal subtraction of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Each unsigned odd byte element from 'in0' is subtracted from even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ #define HSUB_UB2( RTYPE, in0, in1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \ out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \ } #define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ ) #define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ HSUB_UB2( RTYPE, in0, in1, out0, out1 ); \ HSUB_UB2( RTYPE, in2, in3, out2, out3 ); \ } #define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ ) /* Description : SAD (Sum of Absolute Difference) Arguments : Inputs - in0, in1, ref0, ref1 Outputs - sad_m (halfword vector) Return Type - unsigned halfword Details : Absolute difference of all the byte elements from 'in0' with 'ref0' is calculated and preserved in 'diff0'. Then even-odd pairs are added together to generate 8 halfword results. */ #define SAD_UB2_UH( in0, in1, ref0, ref1 ) \ ( { \ v16u8 diff0_m, diff1_m; \ v8u16 sad_m = { 0 }; \ \ diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 ); \ diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 ); \ \ sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m ); \ sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m ); \ \ sad_m; \ } ) /* Description : Set element n input vector to GPR value Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) Output - out (output vector) Return Type - as per RTYPE Details : Set element 0 in vector 'out' to value specified in 'in0' */ #define INSERT_W2( RTYPE, in0, in1, out ) \ { \ out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ } #define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ ) #define INSERT_W4( RTYPE, in0, in1, in2, in3, out ) \ { \ out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \ out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \ } #define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ ) #define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ ) #define INSERT_D2( RTYPE, in0, in1, out ) \ { \ out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \ out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \ } #define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ ) /* Description : Interleave even halfword elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Even halfword elements of 'in0' and 'in1' are interleaved and written to 'out0' */ #define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \ out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \ } #define ILVEV_H2_UB( ... 
) ILVEV_H2( v16u8, __VA_ARGS__ ) /* Description : Interleave even double word elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Even double word elements of 'in0' and 'in1' are interleaved and written to 'out0' */ #define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \ out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \ } #define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ ) /* Description : Interleave left half of byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Left half of byte elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ #define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ } #define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ ) #define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ ) #define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ ) #define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ ) #define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ ) #define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ ) /* Description : Interleave left half of halfword elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Left half of halfword elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ #define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ } #define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ ) #define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ ) #define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ ) /* Description : Interleave left half of word elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Left half of word elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ #define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ } #define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ ) /* Description : Interleave right half of byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of byte elements of 'in0' and 'in1' are interleaved and written to out0. */ #define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ } #define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ ) #define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ ) #define ILVR_B2_SH( ... 
) ILVR_B2( v8i16, __VA_ARGS__ ) #define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ ) #define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ ) #define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ ) #define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ ) /* Description : Interleave right half of halfword elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of halfword elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ #define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ } #define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ ) #define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ ) #define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ ) #define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ ) #define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ } #define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ ) /* Description : Interleave right half of double word elements from vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of double word elements of 'in0' and 'in1' are interleaved and written to 'out0'. */ #define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) ); \ out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) ); \ } #define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ ) #define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ ) #define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ ) #define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ ) /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of byte elements from 'in0' and 'in1' are interleaved and written to 'out0' */ #define ILVRL_B2( RTYPE, in0, in1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ } #define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ ) #define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ ) #define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ ) #define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ ) #define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ ) #define ILVRL_H2( RTYPE, in0, in1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ } #define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ ) #define ILVRL_H2_SW( ... 
) ILVRL_H2( v4i32, __VA_ARGS__ ) #define ILVRL_W2( RTYPE, in0, in1, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ } #define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ ) #define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ ) /* Description : Maximum values between signed elements of vector and 5-bit signed immediate value are copied to the output vector Arguments : Inputs - in0, in1, in2, in3, max_val Outputs - in place operation Return Type - unsigned halfword Details : Maximum of signed halfword element values from 'in0' and 'max_val' are written in place */ #define MAXI_SH2( RTYPE, in0, in1, max_val ) \ { \ in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) ); \ in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) ); \ } #define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ ) #define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ ) #define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val ) \ { \ MAXI_SH2( RTYPE, in0, in1, max_val ); \ MAXI_SH2( RTYPE, in2, in3, max_val ); \ } #define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ ) /* Description : Saturate the halfword element values to the max unsigned value of (sat_val + 1 bits) The element data width remains unchanged Arguments : Inputs - in0, in1, sat_val Outputs - in place operation Return Type - as per RTYPE Details : Each unsigned halfword element from 'in0' is saturated to the value generated with (sat_val+1) bit range. The results are written in place */ #define SAT_UH2( RTYPE, in0, in1, sat_val ) \ { \ in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val ); \ in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val ); \ } #define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ ) #define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val ) \ { \ SAT_UH2( RTYPE, in0, in1, sat_val ); \ SAT_UH2( RTYPE, in2, in3, sat_val ) \ } #define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ ) /* Description : Saturate the halfword element values to the max unsigned value of (sat_val+1 bits) The element data width remains unchanged Arguments : Inputs - in0, in1, sat_val Outputs - in place operation Return Type - as per RTYPE Details : Each unsigned halfword element from 'in0' is saturated to the value generated with (sat_val+1) bit range The results are written in place */ #define SAT_SH2( RTYPE, in0, in1, sat_val ) \ { \ in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val ); \ in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val ); \ } #define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ ) #define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val ) \ { \ SAT_SH2( RTYPE, in0, in1, sat_val ); \ SAT_SH2( RTYPE, in2, in3, sat_val ); \ } #define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ ) /* Description : Saturate the word element values to the max unsigned value of (sat_val+1 bits) The element data width remains unchanged Arguments : Inputs - in0, in1, sat_val Outputs - in place operation Return Type - as per RTYPE Details : Each unsigned word element from 'in0' is saturated to the value generated with (sat_val+1) bit range The results are written in place */ #define SAT_SW2( RTYPE, in0, in1, sat_val ) \ { \ in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val ); \ in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val ); \ } #define SAT_SW2_SW( ... 
) SAT_SW2( v4i32, __VA_ARGS__ ) /* Description : Pack even byte elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Even byte elements of 'in0' are copied to the left half of 'out0' & even byte elements of 'in1' are copied to the right half of 'out0'. */ #define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ } #define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ ) #define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ ) #define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ ) #define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ ) #define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \ { \ PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 ); \ } #define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ ) #define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ ) #define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ ) /* Description : Pack even halfword elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Even halfword elements of 'in0' are copied to the left half of 'out0' & even halfword elements of 'in1' are copied to the right half of 'out0'. */ #define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ } #define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ ) #define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ ) /* Description : Pack even double word elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Even double elements of 'in0' are copied to the left half of 'out0' & even double elements of 'in1' are copied to the right half of 'out0'. */ #define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ } #define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ ) #define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ ) /* Description : Pack odd byte elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Odd byte elements of 'in0' are copied to the left half of 'out0' & odd byte elements of 'in1' are copied to the right half of 'out0'. */ #define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ } #define PCKOD_B2_UB( ... 
) PCKOD_B2( v16u8, __VA_ARGS__ ) #define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ ) /* Description : Pack odd double word elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Odd double word elements of 'in0' are copied to the left half of 'out0' & odd double word elements of 'in1' are copied to the right half of 'out0'. */ #define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ } #define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ ) #define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ ) /* Description : Each byte element is logically xor'ed with immediate 128 Arguments : Inputs - in0, in1 Outputs - in place operation Return Type - as per RTYPE Details : Each unsigned byte element from input vector 'in0' is logically xor'ed with 128 and the result is stored in-place. */ #define XORI_B2_128( RTYPE, in0, in1 ) \ { \ in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 ); \ in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 ); \ } #define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ ) #define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ ) #define XORI_B3_128( RTYPE, in0, in1, in2 ) \ { \ XORI_B2_128( RTYPE, in0, in1 ); \ in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 ); \ } #define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ ) #define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \ { \ XORI_B2_128( RTYPE, in0, in1 ); \ XORI_B2_128( RTYPE, in2, in3 ); \ } #define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ ) #define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ ) #define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \ { \ XORI_B3_128( RTYPE, in0, in1, in2 ); \ XORI_B2_128( RTYPE, in3, in4 ); \ } #define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ ) /* Description : Addition of signed halfword elements and signed saturation Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE Details : Signed halfword elements from 'in0' are added to signed halfword elements of 'in1'. The result is then signed saturated between halfword data type range */ #define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ { \ out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ } #define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ ) #define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ } #define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ ) /* Description : Shift left all elements of vector (generic for all data types) Arguments : Inputs - in0, in1, in2, in3, shift Outputs - in place operation Return Type - as per input vector RTYPE Details : Each element of vector 'in0' is left shifted by 'shift' and the result is written in-place. 
*/ #define SLLI_4V( in0, in1, in2, in3, shift ) \ { \ in0 = in0 << shift; \ in1 = in1 << shift; \ in2 = in2 << shift; \ in3 = in3 << shift; \ } /* Description : Arithmetic shift right all elements of vector (generic for all data types) Arguments : Inputs - in0, in1, in2, in3, shift Outputs - in place operation Return Type - as per input vector RTYPE Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. */ #define SRA_4V( in0, in1, in2, in3, shift ) \ { \ in0 = in0 >> shift; \ in1 = in1 >> shift; \ in2 = in2 >> shift; \ in3 = in3 >> shift; \ } /* Description : Shift right arithmetic rounded halfwords Arguments : Inputs - in0, in1, shift Outputs - in place operation Return Type - as per RTYPE Details : Each element of vector 'in0' is shifted right arithmetic by number of bits respective element holds in vector 'shift'. The last discarded bit is added to shifted value for rounding and the result is written in-place. 'shift' is a vector. */ #define SRAR_H2( RTYPE, in0, in1, shift ) \ { \ in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ } #define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ ) #define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \ { \ SRAR_H2( RTYPE, in0, in1, shift ) \ SRAR_H2( RTYPE, in2, in3, shift ) \ } #define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ ) /* Description : Shift right logical all halfword elements of vector Arguments : Inputs - in0, in1, in2, in3, shift Outputs - in place operation Return Type - as per RTYPE Details : Each element of vector 'in0' is shifted right logical by number of bits respective element holds in vector 'shift' and the result is stored in-place.'shift' is a vector. */ #define SRL_H4( RTYPE, in0, in1, in2, in3, shift ) \ { \ in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift ); \ in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift ); \ } #define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ ) /* Description : Shift right arithmetic rounded (immediate) Arguments : Inputs - in0, in1, shift Outputs - in place operation Return Type - as per RTYPE Details : Each element of vector 'in0' is shifted right arithmetic by value in 'shift'. The last discarded bit is added to shifted value for rounding and the result is written in-place. 'shift' is an immediate value. */ #define SRARI_H2( RTYPE, in0, in1, shift ) \ { \ in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift ); \ in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift ); \ } #define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ ) #define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ ) #define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \ { \ SRARI_H2( RTYPE, in0, in1, shift ); \ SRARI_H2( RTYPE, in2, in3, shift ); \ } #define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ ) #define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ ) #define SRARI_W2( RTYPE, in0, in1, shift ) \ { \ in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift ); \ in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift ); \ } #define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ ) #define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \ { \ SRARI_W2( RTYPE, in0, in1, shift ); \ SRARI_W2( RTYPE, in2, in3, shift ); \ } #define SRARI_W4_SW( ... 
) SRARI_W4( v4i32, __VA_ARGS__ ) /* Description : Multiplication of pairs of vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Details : Each element from 'in0' is multiplied with elements from 'in1' and the result is written to 'out0' */ #define MUL2( in0, in1, in2, in3, out0, out1 ) \ { \ out0 = in0 * in1; \ out1 = in2 * in3; \ } #define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ MUL2( in0, in1, in2, in3, out0, out1 ); \ MUL2( in4, in5, in6, in7, out2, out3 ); \ } /* Description : Addition of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Details : Each element in 'in0' is added to 'in1' and result is written to 'out0'. */ #define ADD2( in0, in1, in2, in3, out0, out1 ) \ { \ out0 = in0 + in1; \ out1 = in2 + in3; \ } #define ADD4( in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ ADD2( in0, in1, in2, in3, out0, out1 ); \ ADD2( in4, in5, in6, in7, out2, out3 ); \ } #define SUB4( in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3 ) \ { \ out0 = in0 - in1; \ out1 = in2 - in3; \ out2 = in4 - in5; \ out3 = in6 - in7; \ } /* Description : Sign extend halfword elements from right half of the vector Arguments : Input - in (halfword vector) Output - out (sign extended word vector) Return Type - signed word Details : Sign bit of halfword elements from input vector 'in' is extracted and interleaved with same vector 'in0' to generate 4 word elements keeping sign intact */ #define UNPCK_R_SH_SW( in, out ) \ { \ v8i16 sign_m; \ \ sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in ); \ } /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) Outputs - out0, out1 (unsigned halfword vectors) Return Type - signed halfword Details : Zero extended right half of vector is returned in 'out0' Zero extended left half of vector is returned in 'out1' */ #define UNPCK_UB_SH( in, out0, out1 ) \ { \ v16i8 zero_m = { 0 }; \ \ ILVRL_B2_SH( zero_m, in, out0, out1 ); \ } /* Description : Sign extend halfword elements from input vector and return the result in pair of vectors Arguments : Input - in (halfword vector) Outputs - out0, out1 (sign extended word vectors) Return Type - signed word Details : Sign bit of halfword elements from input vector 'in' is extracted and interleaved right with same vector 'in0' to generate 4 signed word elements in 'out0' Then interleaved left with same vector 'in0' to generate 4 signed word elements in 'out1' */ #define UNPCK_SH_SW( in, out0, out1 ) \ { \ v8i16 tmp_m; \ \ tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ ILVRL_H2_SW( tmp_m, in, out0, out1 ); \ } /* Description : Butterfly of 4 input vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Details : Butterfly operation */ #define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ out0 = in0 + in3; \ out1 = in1 + in2; \ \ out2 = in1 - in2; \ out3 = in0 - in3; \ } /* Description : Butterfly of 8 input vectors Arguments : Inputs - in0 ... in7 Outputs - out0 .. 
out7 Details : Butterfly operation */ #define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ out0 = in0 + in7; \ out1 = in1 + in6; \ out2 = in2 + in5; \ out3 = in3 + in4; \ \ out4 = in3 - in4; \ out5 = in2 - in5; \ out6 = in1 - in6; \ out7 = in0 - in7; \ } /* Description : Transpose input 8x8 byte block Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - as per RTYPE */ #define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ \ ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5, \ tmp0_m, tmp1_m, tmp2_m, tmp3_m ); \ ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m ); \ ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m ); \ ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 ); \ ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 ); \ SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 ); \ SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 ); \ } #define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ ) /* Description : Transpose 16x8 block into 8x16 with byte elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - unsigned byte */ #define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ \ ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 ); \ ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 ); \ ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 ); \ ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 ); \ \ tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \ tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \ tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \ tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \ out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \ tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \ out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \ tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \ \ ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m ); \ out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ \ tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \ tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 ); \ out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ \ ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m ); \ out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ \ tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \ tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \ tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \ tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \ out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ out7 
= ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ } /* Description : Transpose 4x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - signed halfword */ #define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ v8i16 s0_m, s1_m; \ \ ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m ); \ ILVRL_W2_SH( s1_m, s0_m, out0, out2 ); \ out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 ); \ } /* Description : Transpose 4x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword */ #define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ v8i16 zero_m = { 0 }; \ \ ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6, \ tmp0_n, tmp1_n, tmp2_n, tmp3_n ); \ ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m ); \ ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m ); \ \ out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \ out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \ out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \ out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \ \ out4 = zero_m; \ out5 = zero_m; \ out6 = zero_m; \ out7 = zero_m; \ } /* Description : Transpose 8x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword */ #define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ \ ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m ); \ ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m ); \ ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 ); \ ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 ); \ } /* Description : Transpose 8x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - as per RTYPE */ #define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7 ) \ { \ v8i16 s0_m, s1_m; \ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ \ ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \ ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m ); \ ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \ ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m ); \ ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \ ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m ); \ ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \ ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m ); \ PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ tmp3_m, tmp7_m, out0, out2, out4, out6 ); \ out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m ); \ out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m ); \ out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m ); \ out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m ); \ } #define TRANSPOSE8x8_SH_SH( ... 
) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ ) /* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - signed word */ #define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 ) \ { \ v4i32 s0_m, s1_m, s2_m, s3_m; \ \ ILVRL_W2_SW( in1, in0, s0_m, s1_m ); \ ILVRL_W2_SW( in3, in2, s2_m, s3_m ); \ \ out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \ out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \ out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \ out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \ } /* Description : Add block 4x4 Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Least significant 4 bytes from each input vector are added to the destination bytes, clipped between 0-255 and stored. */ #define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \ { \ uint32_t src0_m, src1_m, src2_m, src3_m; \ uint32_t out0_m, out1_m, out2_m, out3_m; \ v8i16 inp0_m, inp1_m, res0_m, res1_m; \ v16i8 dst0_m = { 0 }; \ v16i8 dst1_m = { 0 }; \ v16i8 zero_m = { 0 }; \ \ ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m ) \ LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m ); \ INSERT_W2_SB( src0_m, src1_m, dst0_m ); \ INSERT_W2_SB( src2_m, src3_m, dst1_m ); \ ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m ); \ ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m ); \ CLIP_SH2_0_255( res0_m, res1_m ); \ PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m ); \ \ out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 ); \ out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 ); \ out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 ); \ out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 ); \ SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \ } /* Description : Dot product and addition of 3 signed halfword input vectors Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 Output - out0_m Return Type - signed halfword Details : Dot product of 'in0' with 'coeff0' Dot product of 'in1' with 'coeff1' Dot product of 'in2' with 'coeff2' Addition of all the 3 vector results out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2) */ #define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 ) \ ( { \ v8i16 tmp1_m; \ v8i16 out0_m; \ \ out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 ); \ out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 ); \ tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 ); \ out0_m = __msa_adds_s_h( out0_m, tmp1_m ); \ \ out0_m; \ } ) /* Description : Pack even elements of input vectors & xor with 128 Arguments : Inputs - in0, in1 Output - out_m Return Type - unsigned byte Details : Signed byte even elements from 'in0' and 'in1' are packed together in one vector and the resulting vector is xor'ed with 128 to shift the range from signed to unsigned byte */ #define PCKEV_XORI128_UB( in0, in1 ) \ ( { \ v16u8 out_m; \ out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \ out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 ); \ out_m; \ } ) /* Description : Pack even byte elements, extract 0 & 2 index words from pair of results and store 4 words in destination memory as per stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride */ #define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \ { \ uint32_t out0_m, out1_m, out2_m, out3_m; \ v16i8 tmp0_m, tmp1_m; \ \ PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m ); \ \ out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 ); \ 
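/* tmp0_m now holds the even bytes of 'in0' in its low half and those of   \
   'in1' in its high half (tmp1_m likewise for 'in2'/'in3'), so words 0     \
   and 2 below each yield four output pixels of one row */                  \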
out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 ); \ out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 ); \ out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 ); \ \ SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \ } /* Description : Pack even byte elements and store byte vector in destination memory Arguments : Inputs - in0, in1, pdst */ #define PCKEV_ST_SB( in0, in1, p_dst ) \ { \ v16i8 tmp_m; \ tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \ ST_SB( tmp_m, ( p_dst ) ); \ } #define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 ) \ ( { \ v4i32 tmp0_m, tmp1_m; \ v8i16 out0_m, out1_m, out2_m, out3_m; \ v8i16 minus5h_m = __msa_ldi_h( -5 ); \ v8i16 plus20h_m = __msa_ldi_h( 20 ); \ \ ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m ); \ \ tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m ); \ tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m ); \ \ ILVRL_H2_SH( in1, in4, out0_m, out1_m ); \ DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m ); \ ILVRL_H2_SH( in2, in3, out2_m, out3_m ); \ DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m ); \ \ SRARI_W2_SW( tmp0_m, tmp1_m, 10 ); \ SAT_SW2_SW( tmp0_m, tmp1_m, 7 ); \ out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \ \ out0_m; \ } ) #define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 ) \ ( { \ v8i16 out0_m, out1_m; \ v16i8 tmp0_m, tmp1_m; \ v16i8 minus5b = __msa_ldi_b( -5 ); \ v16i8 plus20b = __msa_ldi_b( 20 ); \ \ tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in ); \ out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m ); \ \ tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in ); \ out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m ); \ \ tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in ); \ out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m ); \ \ out1_m; \ } ) #endif /* X264_MIPS_MACROS_H */ x264-master/common/mips/mc-c.c000066400000000000000000004522701502133446700163140ustar00rootroot00000000000000/***************************************************************************** * mc-c.c: msa motion compensation ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Neha Rana * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macros.h" #include "mc.h" #if !HIGH_BIT_DEPTH static const uint8_t pu_luma_mask_arr[16 * 8] = { /* 8 width cases */ 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, /* 4 width cases */ 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24, 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26 }; static const uint8_t pu_chroma_mask_arr[16 * 5] = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 }; static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { uint32_t u_loop_cnt, u_h4w; v16u8 dst0; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v16i8 mask0, mask1, mask2; v16i8 vec0, vec1, vec2, vec3, vec4, vec5; v16i8 vec6, vec7, vec8, vec9, vec10, vec11; v16i8 minus5b = __msa_ldi_b( -5 ); v16i8 plus20b = __msa_ldi_b( 20 ); u_h4w = i_height % 4; LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) { LD_SB2( p_src, 8, src0, src1 ); p_src += i_src_stride; LD_SB2( p_src, 8, src2, src3 ); p_src += i_src_stride; XORI_B4_128_SB( src0, src1, src2, src3 ); VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 ); VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 ); VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 ); VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 ); VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 ); VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 ); HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 ); DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, minus5b, res0, res1, res2, res3 ); DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, plus20b, res0, res1, res2, res3 ); LD_SB2( p_src, 8, src4, src5 ); p_src += i_src_stride; LD_SB2( p_src, 8, src6, src7 ); p_src += i_src_stride; XORI_B4_128_SB( src4, src5, src6, src7 ); VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 ); VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 ); VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 ); VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 ); VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 ); VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 ); HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 ); DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, minus5b, res4, res5, res6, res7 ); DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, plus20b, res4, res5, res6, res7 ); SRARI_H4_SH( res0, res1, res2, res3, 5 ); SRARI_H4_SH( res4, res5, res6, res7, 5 ); SAT_SH4_SH( res0, res1, res2, res3, 7 ); SAT_SH4_SH( res4, res5, res6, res7, 7 ); PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1, vec2, vec3 ); XORI_B4_128_SB( vec0, vec1, vec2, vec3 ); ST_SB4( vec0, vec1, vec2, vec3, p_dst, 
i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } for( u_loop_cnt = u_h4w; u_loop_cnt--; ) { LD_SB2( p_src, 8, src0, src1 ); p_src += i_src_stride; XORI_B2_128_SB( src0, src1 ); VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 ); VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 ); VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 ); res0 = __msa_hadd_s_h( vec0, vec0 ); DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 ); res1 = __msa_hadd_s_h( vec3, vec3 ); DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 ); SRARI_H2_SH( res0, res1, 5 ); SAT_SH2_SH( res0, res1, 7 ); dst0 = PCKEV_XORI128_UB( res0, res1 ); ST_UB( dst0, p_dst ); p_dst += i_dst_stride; } } static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { uint32_t u_loop_cnt, u_h4w; const int16_t i_filt_const0 = 0xfb01; const int16_t i_filt_const1 = 0x1414; const int16_t i_filt_const2 = 0x1fb; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; v16i8 src65_l, src87_l; v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; v16u8 res0, res1, res2, res3; v16i8 filt0, filt1, filt2; u_h4w = i_height % 4; filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 ); filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 ); filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 ); LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); p_src += ( 5 * i_src_stride ); XORI_B5_128_SB( src0, src1, src2, src3, src4 ); ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r ); ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l, src32_l, src43_l ); for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) { LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 ); p_src += ( 4 * i_src_stride ); XORI_B4_128_SB( src5, src6, src7, src8 ); ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, src76_r, src87_r ); ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l, src76_l, src87_l ); out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r, filt0, filt1, filt2 ); out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r, filt0, filt1, filt2 ); out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r, filt0, filt1, filt2 ); out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r, filt0, filt1, filt2 ); out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l, filt0, filt1, filt2 ); out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l, filt0, filt1, filt2 ); out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l, filt0, filt1, filt2 ); out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l, filt0, filt1, filt2 ); SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 ); SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 ); SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 ); SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 ); PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, res0, res1, res2, res3 ); XORI_B4_128_UB( res0, res1, res2, res3 ); ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); src10_r = src54_r; src32_r = src76_r; src21_r = src65_r; src43_r = src87_r; src10_l = src54_l; src32_l = src76_l; src21_l = src65_l; src43_l = src87_l; src4 = src8; } for( u_loop_cnt = u_h4w; u_loop_cnt--; ) { src5 = LD_SB( p_src ); p_src += ( i_src_stride ); src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 ); ILVRL_B2_SB( 
src5, src4, src54_r, src54_l ); out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r, filt0, filt1, filt2 ); out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l, filt0, filt1, filt2 ); SRARI_H2_SH( out0_r, out0_l, 5 ); SAT_SH2_SH( out0_r, out0_l, 7 ); out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r ); res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 ); ST_UB( res0, p_dst ); p_dst += i_dst_stride; src10_r = src21_r; src21_r = src32_r; src32_r = src43_r; src43_r = src54_r; src10_l = src21_l; src21_l = src32_l; src32_l = src43_l; src43_l = src54_l; src4 = src5; } } static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { uint32_t u_loop_cnt, u_h4w; uint64_t u_out0; v16i8 tmp0; v16i8 src0, src1, src2, src3, src4; v16i8 mask0, mask1, mask2; v8i16 hz_out0, hz_out1, hz_out2, hz_out3; v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; v8i16 dst0, dst1, dst2, dst3; v16u8 out0, out1; u_h4w = i_height % 4; LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); XORI_B5_128_SB( src0, src1, src2, src3, src4 ); p_src += ( 5 * i_src_stride ); hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 ); hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 ); hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 ); hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 ); for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) { LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); XORI_B4_128_SB( src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 ); hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 ); hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 ); dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5 ); dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6 ); dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4, hz_out5, hz_out6, hz_out7 ); dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5, hz_out6, hz_out7, hz_out8 ); out0 = PCKEV_XORI128_UB( dst0, dst1 ); out1 = PCKEV_XORI128_UB( dst2, dst3 ); ST8x4_UB( out0, out1, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); hz_out3 = hz_out7; hz_out1 = hz_out5; hz_out5 = hz_out4; hz_out4 = hz_out8; hz_out2 = hz_out6; hz_out0 = hz_out5; } for( u_loop_cnt = u_h4w; u_loop_cnt--; ) { src0 = LD_SB( p_src ); p_src += i_src_stride; src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 ); hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5 ); tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) ); tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 ); u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); SD( u_out0, p_dst ); p_dst += i_dst_stride; hz_out0 = hz_out1; hz_out1 = hz_out2; hz_out2 = hz_out3; hz_out3 = hz_out4; hz_out4 = hz_out5; } } static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { uint32_t u_multiple8_cnt; for( u_multiple8_cnt = 2; u_multiple8_cnt--; ) { avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); p_src += 8; p_dst += 8; } } static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src, int32_t i_src_stride, 
uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1 ) { uint16_t u_out0, u_out1, u_out2, u_out3; v16u8 src0, src1, src2, src3, src4; v8u16 res_hz0, res_hz1, res_hz2, res_hz3; v8u16 res_vt0, res_vt1, res_vt2, res_vt3; v16i8 mask; v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); v8i16 res0, res1; mask = LD_SB( &pu_chroma_mask_arr[16] ); LD_UB3( p_src, i_src_stride, src0, src1, src2 ); VSHF_B2_UB( src0, src1, src1, src2, ( mask + 1 ), ( mask + 1 ), src3, src4 ); VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3 ); MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 ); SRARI_H2_UH( res_vt0, res_vt2, 6 ); SAT_UH2_UH( res_vt0, res_vt2, 7 ); PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 ); u_out0 = __msa_copy_u_h( res0, 0 ); u_out1 = __msa_copy_u_h( res0, 2 ); u_out2 = __msa_copy_u_h( res1, 0 ); u_out3 = __msa_copy_u_h( res1, 2 ); SH( u_out0, p_dst_u ); p_dst_u += i_dst_stride; SH( u_out1, p_dst_u ); SH( u_out2, p_dst_v ); p_dst_v += i_dst_stride; SH( u_out3, p_dst_v ); } static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1 ) { uint16_t u_out0, u_out1, u_out2, u_out3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v8u16 res_hz0, res_hz1, res_hz2, res_hz3; v8u16 res_vt0, res_vt1, res_vt2, res_vt3; v16i8 mask; v8i16 res0, res1; v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); mask = LD_SB( &pu_chroma_mask_arr[16] ); LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); VSHF_B2_UB( src0, src1, src1, src2, ( mask + 1 ), ( mask + 1 ), src5, src6 ); VSHF_B2_UB( src2, src3, src3, src4, ( mask + 1 ), ( mask + 1 ), src7, src8 ); VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 ); DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3 ); MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); SRARI_H2_UH( res_vt0, res_vt1, 6 ); SAT_UH2_UH( res_vt0, res_vt1, 7 ); PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); u_out0 = __msa_copy_u_h( res0, 0 ); u_out1 = __msa_copy_u_h( res0, 2 ); u_out2 = __msa_copy_u_h( res1, 0 ); u_out3 = __msa_copy_u_h( res1, 2 ); SH( u_out0, p_dst_u ); p_dst_u += i_dst_stride; SH( u_out1, p_dst_u ); p_dst_u += i_dst_stride; SH( u_out2, p_dst_u ); p_dst_u += i_dst_stride; SH( u_out3, p_dst_u ); DOTP_UB4_UH( src5, 
src6, src7, src8, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3 ); MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); SRARI_H2_UH( res_vt0, res_vt1, 6 ); SAT_UH2_UH( res_vt0, res_vt1, 7 ); PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); u_out0 = __msa_copy_u_h( res0, 0 ); u_out1 = __msa_copy_u_h( res0, 2 ); u_out2 = __msa_copy_u_h( res1, 0 ); u_out3 = __msa_copy_u_h( res1, 2 ); SH( u_out0, p_dst_v ); p_dst_v += i_dst_stride; SH( u_out1, p_dst_v ); p_dst_v += i_dst_stride; SH( u_out2, p_dst_v ); p_dst_v += i_dst_stride; SH( u_out3, p_dst_v ); } static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1, int32_t i_height ) { if( 2 == i_height ) { avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, u_coef_hor0, u_coef_hor1, u_coef_ver0, u_coef_ver1 ); } else if( 4 == i_height ) { avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, u_coef_hor0, u_coef_hor1, u_coef_ver0, u_coef_ver1 ); } } static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1 ) { uint32_t u_out0, u_out1, u_out2, u_out3; v16u8 src0, src1, src2, src3, src4; v8u16 res_hz0, res_hz1, res_hz2, res_hz3; v8u16 res_vt0, res_vt1, res_vt2, res_vt3; v16i8 mask; v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); v4i32 res0, res1; mask = LD_SB( &pu_chroma_mask_arr[16] ); LD_UB3( p_src, i_src_stride, src0, src1, src2 ); VSHF_B2_UB( src0, src1, src1, src2, ( mask + 1 ), ( mask + 1 ), src3, src4 ); VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3 ); MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 ); SRARI_H2_UH( res_vt0, res_vt2, 6 ); SAT_UH2_UH( res_vt0, res_vt2, 7 ); PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 ); u_out0 = __msa_copy_u_w( res0, 0 ); u_out1 = __msa_copy_u_w( res0, 1 ); u_out2 = __msa_copy_u_w( res1, 0 ); u_out3 = __msa_copy_u_w( res1, 1 ); SW( u_out0, p_dst_u ); p_dst_u += i_dst_stride; SW( u_out1, p_dst_u ); SW( u_out2, p_dst_v ); p_dst_v += i_dst_stride; SW( u_out3, p_dst_v ); } static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1, int32_t i_height ) { uint32_t u_row; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v8u16 res_hz0, res_hz1, res_hz2, res_hz3; v8u16 res_vt0, res_vt1, res_vt2, res_vt3; v16i8 mask; v4i32 res0, res1; v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 
); v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); mask = LD_SB( &pu_chroma_mask_arr[16] ); src0 = LD_UB( p_src ); p_src += i_src_stride; for( u_row = ( i_height >> 2 ); u_row--; ) { LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 ); p_src += ( 4 * i_src_stride ); VSHF_B2_UB( src0, src1, src1, src2, ( mask + 1 ), ( mask + 1 ), src5, src6 ); VSHF_B2_UB( src2, src3, src3, src4, ( mask + 1 ), ( mask + 1 ), src7, src8 ); VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 ); DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3 ); MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); SRARI_H2_UH( res_vt0, res_vt1, 6 ); SAT_UH2_UH( res_vt0, res_vt1, 7 ); PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride ); p_dst_u += ( 4 * i_dst_stride ); DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3 ); MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); SRARI_H2_UH( res_vt0, res_vt1, 6 ); SAT_UH2_UH( res_vt0, res_vt1, 7 ); PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride ); p_dst_v += ( 4 * i_dst_stride ); src0 = src4; } } static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1, int32_t i_height ) { if( 2 == i_height ) { avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, u_coef_hor0, u_coef_hor1, u_coef_ver0, u_coef_ver1 ); } else { avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, u_coef_hor0, u_coef_hor1, u_coef_ver0, u_coef_ver1, i_height ); } } static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst_u, uint8_t *p_dst_v, int32_t i_dst_stride, uint32_t u_coef_hor0, uint32_t u_coef_hor1, uint32_t u_coef_ver0, uint32_t u_coef_ver1, int32_t i_height ) { uint32_t u_row; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; v16u8 src10, src11, src12, src13, src14; v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5; v8u16 res_vt0, res_vt1, res_vt2, res_vt3; v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 }; v16i8 coeff_hz_vec0, coeff_hz_vec1; v16i8 tmp0, tmp1; v16u8 coeff_hz_vec; v8u16 coeff_vt_vec0, coeff_vt_vec1; coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); LD_UB2( p_src, 16, src0, src13 ); p_src += i_src_stride; VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 ); DOTP_UB2_UH( src0, src14, 
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 ); for( u_row = ( i_height >> 2 ); u_row--; ) { LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 ); LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 ); p_src += ( 4 * i_src_stride ); VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 ); VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 ); DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4 ); MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); res_vt0 += ( res_hz0 * coeff_vt_vec1 ); res_vt1 += ( res_hz1 * coeff_vt_vec1 ); res_vt2 += ( res_hz2 * coeff_vt_vec1 ); res_vt3 += ( res_hz3 * coeff_vt_vec1 ); SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 ); SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 ); PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 ); ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride ); p_dst_u += ( 4 * i_dst_stride ); res_hz0 = res_hz4; VSHF_B2_UB( src1, src5, src2, src6, ( mask + 1 ), ( mask + 1 ), src5, src6 ); VSHF_B2_UB( src3, src7, src4, src8, ( mask + 1 ), ( mask + 1 ), src7, src8 ); DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4 ); MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3 ); res_vt0 += ( res_hz5 * coeff_vt_vec1 ); res_vt1 += ( res_hz1 * coeff_vt_vec1 ); res_vt2 += ( res_hz2 * coeff_vt_vec1 ); res_vt3 += ( res_hz3 * coeff_vt_vec1 ); SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 ); SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 ); PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 ); ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride ); p_dst_v += ( 4 * i_dst_stride ); res_hz5 = res_hz4; } } static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_log2_denom, int32_t i_weight, int32_t i_offset_in ) { uint32_t u_load0, u_load1, u_out0, u_out1; v16u8 zero = { 0 }; v16u8 src0, src1; v4i32 dst0, dst1; v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1; v8i16 vec0, vec1; i_offset_in <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); u_load0 = LW( p_src ); p_src += i_src_stride; u_load1 = LW( p_src ); src0 = ( v16u8 ) __msa_fill_w( u_load0 ); src1 = ( v16u8 ) __msa_fill_w( u_load1 ); ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 ); MUL2( wgt, temp0, wgt, temp1, temp0, temp1 ); ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 ); MAXI_SH2_SH( vec0, vec1, 0 ); tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom ); tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom ); SAT_UH2_UH( tp0, tp1, 7 ); PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 ); u_out0 = __msa_copy_u_w( dst0, 0 ); u_out1 = __msa_copy_u_w( dst1, 0 ); SW( u_out0, p_dst ); p_dst += i_dst_stride; SW( u_out1, p_dst ); } static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_weight, int32_t i_offset_in ) { uint8_t u_cnt; uint32_t u_load0, u_load1, u_load2, u_load3; v16u8 zero = { 0 }; v16u8 src0, src1, src2, src3; v8u16 temp0, temp1, temp2, temp3; v8u16 wgt, denom, offset; 
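    /* Pre-scale the offset so a single add + logical shift implements the
       H.264 explicit weighted-prediction formula:
       ( w * p + ( ( o << d ) + ( 1 << ( d - 1 ) ) ) ) >> d
       == ( ( w * p + ( 1 << ( d - 1 ) ) ) >> d ) + o for d > 0
       (for d == 0 the result is simply w * p + o). */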
i_offset_in <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); for( u_cnt = i_height / 4; u_cnt--; ) { LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 ); p_src += 4 * i_src_stride; src0 = ( v16u8 ) __msa_fill_w( u_load0 ); src1 = ( v16u8 ) __msa_fill_w( u_load1 ); src2 = ( v16u8 ) __msa_fill_w( u_load2 ); src3 = ( v16u8 ) __msa_fill_w( u_load3 ); ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, temp0, temp1, temp2, temp3 ); MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, temp0, temp1, temp2, temp3 ); ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, temp0, temp1, temp2, temp3 ); MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_weight, int32_t i_offset_in ) { if( 2 == i_height ) { avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_log2_denom, i_weight, i_offset_in ); } else { avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height, i_log2_denom, i_weight, i_offset_in ); } } static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_weight, int32_t i_offset_in ) { uint8_t u_cnt; v16u8 zero = { 0 }; v16u8 src0, src1, src2, src3; v8u16 temp0, temp1, temp2, temp3; v8u16 wgt, denom, offset; v16i8 out0, out1; i_offset_in <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); for( u_cnt = i_height / 4; u_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += 4 * i_src_stride; ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, temp0, temp1, temp2, temp3 ); MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, temp0, temp1, temp2, temp3 ); ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, temp0, temp1, temp2, temp3 ); MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 ); ST8x4_UB( out0, out1, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_weight, int32_t i_offset_in ) { uint8_t u_cnt; v16i8 zero = { 0 }; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v8u16 wgt, denom, offset; i_offset_in <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); for( u_cnt = i_height / 4; u_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += 4 * i_src_stride; ILVR_B4_UH( zero, src0, zero, src1, 
zero, src2, zero, src3, temp0, temp2, temp4, temp6 ); ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, temp1, temp3, temp5, temp7 ); MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, temp0, temp1, temp2, temp3 ); MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7, temp4, temp5, temp6, temp7 ); ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, temp0, temp1, temp2, temp3 ); ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset, temp4, temp5, temp6, temp7 ); MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 ); SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); SRL_H4_UH( temp4, temp5, temp6, temp7, denom ); SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 ); PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, dst0, dst1, dst2, dst3 ); ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride ); p_dst += 4 * i_dst_stride; } } static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint32_t u_load0, u_load1, u_out0, u_out1; v8i16 src1_wgt, src2_wgt; v16u8 in0, in1, in2, in3; v8i16 temp0, temp1, temp2, temp3; v16i8 zero = { 0 }; v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); src1_wgt = __msa_fill_h( i_src1_weight ); src2_wgt = __msa_fill_h( i_src2_weight ); u_load0 = LW( p_src1_in ); u_load1 = LW( p_src1_in + i_src1_stride ); in0 = ( v16u8 ) __msa_fill_w( u_load0 ); in1 = ( v16u8 ) __msa_fill_w( u_load1 ); u_load0 = LW( p_src2_in ); u_load1 = LW( p_src2_in + i_src2_stride ); in2 = ( v16u8 ) __msa_fill_w( u_load0 ); in3 = ( v16u8 ) __msa_fill_w( u_load1 ); ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3, temp0, temp1, temp2, temp3 ); temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt ); temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt ); SRAR_H2_SH( temp0, temp1, denom ); CLIP_SH2_0_255( temp0, temp1 ); PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 ); u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 ); u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 ); SW( u_out0, p_dst ); p_dst += i_dst_stride; SW( u_out1, p_dst ); } static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint8_t u_cnt; uint32_t u_load0, u_load1, u_load2, u_load3; v8i16 src1_wgt, src2_wgt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v16i8 zero = { 0 }; v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); src1_wgt = __msa_fill_h( i_src1_weight ); src2_wgt = __msa_fill_h( i_src2_weight ); for( u_cnt = i_height / 4; u_cnt--; ) { LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 ); p_src1_in += ( 4 * i_src1_stride ); src0 = ( v16u8 ) __msa_fill_w( u_load0 ); src1 = ( v16u8 ) __msa_fill_w( u_load1 ); src2 = ( v16u8 ) __msa_fill_w( u_load2 ); src3 = ( v16u8 ) __msa_fill_w( u_load3 ); LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 ); p_src2_in += ( 4 * i_src2_stride ); src4 = ( v16u8 ) __msa_fill_w( u_load0 ); src5 = ( v16u8 ) __msa_fill_w( u_load1 ); src6 = ( v16u8 ) __msa_fill_w( u_load2 ); src7 = ( v16u8 ) __msa_fill_w( u_load3 ); ILVR_B4_SH( zero, src0, zero, src1, zero, 
src2, zero, src3, temp0, temp1, temp2, temp3 ); ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, temp4, temp5, temp6, temp7 ); temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt ); temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt ); temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt ); temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt ); SRAR_H4_SH( temp0, temp1, temp2, temp3, denom ); CLIP_SH4_0_255( temp0, temp1, temp2, temp3 ); PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { if( 2 == i_height ) { avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride, p_src2_in, i_src2_stride, p_dst, i_dst_stride, i_log2_denom, i_src1_weight, i_src2_weight, i_offset_in ); } else { avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride, p_src2_in, i_src2_stride, p_dst, i_dst_stride, i_height, i_log2_denom, i_src1_weight, i_src2_weight, i_offset_in ); } } static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint8_t u_cnt; v8i16 src1_wgt, src2_wgt; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; v8i16 temp0, temp1, temp2, temp3; v8i16 res0, res1, res2, res3; v16i8 zero = { 0 }; v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); src1_wgt = __msa_fill_h( i_src1_weight ); src2_wgt = __msa_fill_h( i_src2_weight ); for( u_cnt = i_height / 4; u_cnt--; ) { LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); p_src1_in += ( 4 * i_src1_stride ); LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 ); p_src2_in += ( 4 * i_src2_stride ); ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, temp0, temp1, temp2, temp3 ); ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3, res0, res1, res2, res3 ); res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt ); res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt ); res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt ); res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt ); SRAR_H4_SH( res0, res1, res2, res3, denom ); CLIP_SH4_0_255( res0, res1, res2, res3 ); PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3, dst0, dst1, dst2, dst3 ); ST8x1_UB( dst0, p_dst ); p_dst += i_dst_stride; ST8x1_UB( dst1, p_dst ); p_dst += i_dst_stride; ST8x1_UB( dst2, p_dst ); p_dst += i_dst_stride; ST8x1_UB( dst3, p_dst ); p_dst += i_dst_stride; } } static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint8_t u_cnt; v8i16 src1_wgt, src2_wgt; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v16i8 zero = { 0 }; v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); src1_wgt = __msa_fill_h( i_src1_weight ); src2_wgt = __msa_fill_h( i_src2_weight ); for( u_cnt = i_height / 4; u_cnt--; ) { LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); p_src1_in += ( 4 * 
i_src1_stride ); LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 ); p_src2_in += ( 4 * i_src2_stride ); ILVRL_B2_SH( zero, src0, temp1, temp0 ); ILVRL_B2_SH( zero, src1, temp3, temp2 ); ILVRL_B2_SH( zero, src2, temp5, temp4 ); ILVRL_B2_SH( zero, src3, temp7, temp6 ); ILVRL_B2_SH( zero, dst0, res1, res0 ); ILVRL_B2_SH( zero, dst1, res3, res2 ); ILVRL_B2_SH( zero, dst2, res5, res4 ); ILVRL_B2_SH( zero, dst3, res7, res6 ); res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt ); res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt ); res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt ); res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt ); res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt ); res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt ); res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt ); res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt ); SRAR_H4_SH( res0, res1, res2, res3, denom ); SRAR_H4_SH( res4, res5, res6, res7, denom ); CLIP_SH4_0_255( res0, res1, res2, res3 ); CLIP_SH4_0_255( res4, res5, res6, res7 ); PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7, dst0, dst1, dst2, dst3 ); ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride ); p_dst += 4 * i_dst_stride; } } static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint32_t u_load0, u_load1, u_out0, u_out1; v16u8 src1_wgt, src2_wgt, wgt; v16i8 in0, in1, in2, in3; v8u16 temp0, temp1, denom, offset; i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); u_load0 = LW( p_src1_in ); u_load1 = LW( p_src1_in + i_src1_stride ); in0 = ( v16i8 ) __msa_fill_w( u_load0 ); in1 = ( v16i8 ) __msa_fill_w( u_load1 ); u_load0 = LW( p_src2_in ); u_load1 = LW( p_src2_in + i_src2_stride ); in2 = ( v16i8 ) __msa_fill_w( u_load0 ); in3 = ( v16i8 ) __msa_fill_w( u_load1 ); ILVR_B2_SB( in2, in0, in3, in1, in0, in1 ); temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 ); temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 ); temp0 >>= denom; temp1 >>= denom; MAXI_SH2_UH( temp0, temp1, 0 ); SAT_UH2_UH( temp0, temp1, 7 ); PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 ); u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 ); u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 ); SW( u_out0, p_dst ); p_dst += i_dst_stride; SW( u_out1, p_dst ); } static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint8_t u_cnt; uint32_t u_load0, u_load1, u_load2, u_load3; v16u8 src1_wgt, src2_wgt, wgt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 temp0, temp1, temp2, temp3; v8u16 res0, res1, res2, res3; v8u16 denom, offset; i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); for( u_cnt = i_height / 4; 
u_cnt--; ) { LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 ); p_src1_in += ( 4 * i_src1_stride ); src0 = ( v16u8 ) __msa_fill_w( u_load0 ); src1 = ( v16u8 ) __msa_fill_w( u_load1 ); src2 = ( v16u8 ) __msa_fill_w( u_load2 ); src3 = ( v16u8 ) __msa_fill_w( u_load3 ); LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 ); p_src2_in += ( 4 * i_src2_stride ); src4 = ( v16u8 ) __msa_fill_w( u_load0 ); src5 = ( v16u8 ) __msa_fill_w( u_load1 ); src6 = ( v16u8 ) __msa_fill_w( u_load2 ); src7 = ( v16u8 ) __msa_fill_w( u_load3 ); ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, temp0, temp1, temp2, temp3 ); DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, res0, res1, res2, res3 ); ADD4( res0, offset, res1, offset, res2, offset, res3, offset, res0, res1, res2, res3 ); SRA_4V( res0, res1, res2, res3, denom ); MAXI_SH4_UH( res0, res1, res2, res3, 0 ); SAT_UH4_UH( res0, res1, res2, res3, 7 ); PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { if( 2 == i_height ) { avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride, p_src2_in, i_src2_stride, p_dst, i_dst_stride, i_log2_denom, i_src1_weight, i_src2_weight, i_offset_in ); } else { avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride, p_src2_in, i_src2_stride, p_dst, i_dst_stride, i_height, i_log2_denom, i_src1_weight, i_src2_weight, i_offset_in ); } } static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint8_t u_cnt; v16u8 src1_wgt, src2_wgt, wgt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 temp0, temp1, temp2, temp3; v8u16 res0, res1, res2, res3; v8u16 denom, offset; v16i8 out0, out1; i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); for( u_cnt = i_height / 4; u_cnt--; ) { LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); p_src1_in += ( 4 * i_src1_stride ); LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 ); p_src2_in += ( 4 * i_src2_stride ); ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, temp0, temp1, temp2, temp3 ); DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, res0, res1, res2, res3 ); ADD4( res0, offset, res1, offset, res2, offset, res3, offset, res0, res1, res2, res3 ); SRA_4V( res0, res1, res2, res3, denom ); MAXI_SH4_UH( res0, res1, res2, res3, 0 ); SAT_UH4_UH( res0, res1, res2, res3, 7 ); PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 ); ST8x4_UB( out0, out1, p_dst, i_dst_stride ); p_dst += 4 * i_dst_stride; } } static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in, int32_t i_src1_stride, uint8_t *p_src2_in, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_log2_denom, int32_t i_src1_weight, int32_t i_src2_weight, int32_t i_offset_in ) { uint8_t u_cnt; v16u8 
src1_wgt, src2_wgt, wgt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v8u16 res0, res1, res2, res3, res4, res5, res6, res7; v8u16 denom, offset; i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); offset = ( v8u16 ) __msa_fill_h( i_offset_in ); denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); for( u_cnt = i_height / 4; u_cnt--; ) { LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); p_src1_in += ( 4 * i_src1_stride ); LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 ); p_src2_in += ( 4 * i_src2_stride ); ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, temp0, temp2, temp4, temp6 ); ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, temp1, temp3, temp5, temp7 ); DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, res0, res1, res2, res3 ); ADD4( res0, offset, res1, offset, res2, offset, res3, offset, res0, res1, res2, res3 ); DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt, res4, res5, res6, res7 ); ADD4( res4, offset, res5, offset, res6, offset, res7, offset, res4, res5, res6, res7 ); SRA_4V( res0, res1, res2, res3, denom ); SRA_4V( res4, res5, res6, res7, denom ); MAXI_SH4_UH( res0, res1, res2, res3, 0 ); MAXI_SH4_UH( res4, res5, res6, res7, 0 ); SAT_UH4_UH( res0, res1, res2, res3, 7 ); SAT_UH4_UH( res4, res5, res6, res7, 7 ); PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6, temp0, temp1, temp2, temp3 ); ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); p_dst += 4 * i_dst_stride; } } static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_cnt; uint32_t u_src0, u_src1; for( i_cnt = ( i_height / 2 ); i_cnt--; ) { u_src0 = LW( p_src ); p_src += i_src_stride; u_src1 = LW( p_src ); p_src += i_src_stride; SW( u_src0, p_dst ); p_dst += i_dst_stride; SW( u_src1, p_dst ); p_dst += i_dst_stride; } } static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_cnt; uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; if( 0 == i_height % 12 ) { for( i_cnt = ( i_height / 12 ); i_cnt--; ) { LD_UB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); p_src += ( 8 * i_src_stride ); u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 ); u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 ); u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 ); u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } else if( 0 == i_height % 8 ) { for( i_cnt = i_height 
>> 3; i_cnt--; ) { LD_UB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); p_src += ( 8 * i_src_stride ); u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 ); u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 ); u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 ); u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } else if( 0 == i_height % 4 ) { for( i_cnt = ( i_height / 4 ); i_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } else if( 0 == i_height % 2 ) { for( i_cnt = ( i_height / 2 ); i_cnt--; ) { LD_UB2( p_src, i_src_stride, src0, src1 ); p_src += ( 2 * i_src_stride ); u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); SD( u_out0, p_dst ); p_dst += i_dst_stride; SD( u_out1, p_dst ); p_dst += i_dst_stride; } } } static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height, int32_t i_width ) { int32_t i_cnt, i_loop_cnt; uint8_t *p_src_tmp, *p_dst_tmp; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; for( i_cnt = ( i_width >> 4 ); i_cnt--; ) { p_src_tmp = p_src; p_dst_tmp = p_dst; for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; ) { LD_UB8( p_src_tmp, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); p_src_tmp += ( 8 * i_src_stride ); ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7, p_dst_tmp, i_dst_stride ); p_dst_tmp += ( 8 * i_dst_stride ); } p_src += 16; p_dst += 16; } } static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; if( 0 == i_height % 12 ) { for( i_cnt = ( i_height / 12 ); i_cnt--; ) { LD_UB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); p_src += ( 8 * i_src_stride ); ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7, p_dst, i_dst_stride ); p_dst += ( 8 * i_dst_stride ); LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } else if( 0 == i_height % 8 ) { copy_16multx8mult_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height, 16 ); } else if( 0 == i_height % 4 ) { for( i_cnt = ( i_height >> 2 ); i_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } } static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride, uint8_t *p_src2, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_cnt; uint32_t u_out0, u_out1; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1; for( i_cnt = ( i_height / 2 ); i_cnt--; ) { LD_UB2( p_src1, i_src1_stride, src0, src1 ); p_src1 += ( 2 * i_src1_stride ); LD_UB2( p_src2, 
i_src2_stride, src2, src3 ); p_src2 += ( 2 * i_src2_stride ); AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 ); u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 ); u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 ); SW( u_out0, p_dst ); p_dst += i_dst_stride; SW( u_out1, p_dst ); p_dst += i_dst_stride; } } static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride, uint8_t *p_src2, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_cnt; uint64_t u_out0, u_out1, u_out2, u_out3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 dst0, dst1, dst2, dst3; for( i_cnt = ( i_height / 4 ); i_cnt--; ) { LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 ); p_src1 += ( 4 * i_src1_stride ); LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 ); p_src2 += ( 4 * i_src2_stride ); AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7, dst0, dst1, dst2, dst3 ); u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 ); u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 ); u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 ); u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride, uint8_t *p_src2, int32_t i_src2_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_cnt; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; for( i_cnt = ( i_height / 8 ); i_cnt--; ) { LD_UB8( p_src1, i_src1_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); p_src1 += ( 8 * i_src1_stride ); LD_UB8( p_src2, i_src2_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); p_src2 += ( 8 * i_src2_stride ); AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, dst2, dst3 ); AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, dst6, dst7 ); ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, p_dst, i_dst_stride ); p_dst += ( 8 * i_dst_stride ); } } static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride, int32_t i_height ) { int8_t i_cnt; v16u8 zero = { 0 }; for( i_cnt = ( i_height / 2 ); i_cnt--; ) { ST_UB( zero, p_src ); p_src += i_stride; ST_UB( zero, p_src ); p_src += i_stride; } } static void core_plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride, uint8_t *p_src1, int32_t i_src1_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_width, int32_t i_height ) { int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3; v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3; i_w_mul8 = i_width - i_width % 8; i_h4w = i_height - i_height % 4; for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; ) { for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) { LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 ); LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 ); ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 ); ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 ); ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3, p_dst, i_dst_stride ); ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3, ( p_dst + 16 ), i_dst_stride ); p_src0 += 16; p_src1 += 16; p_dst += 32; } for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; ) { LD_UB4( p_src0, i_src0_stride, src0, 
src1, src2, src3 ); LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 ); ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 ); ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3, p_dst, i_dst_stride ); p_src0 += 8; p_src1 += 8; p_dst += 16; } for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) { p_dst[0] = p_src0[0]; p_dst[1] = p_src1[0]; p_dst[i_dst_stride] = p_src0[i_src0_stride]; p_dst[i_dst_stride + 1] = p_src1[i_src1_stride]; p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride]; p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride]; p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride]; p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride]; p_src0 += 1; p_src1 += 1; p_dst += 2; } p_src0 += ( ( 4 * i_src0_stride ) - i_width ); p_src1 += ( ( 4 * i_src1_stride ) - i_width ); p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) ); } for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ ) { for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) { src0 = LD_UB( p_src0 ); src4 = LD_UB( p_src1 ); ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 ); ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 ); p_src0 += 16; p_src1 += 16; p_dst += 32; } for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; ) { src0 = LD_UB( p_src0 ); src4 = LD_UB( p_src1 ); vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4, ( v16i8 ) src0 ); ST_UB( vec_ilv_r0, p_dst ); p_src0 += 8; p_src1 += 8; p_dst += 16; } for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) { p_dst[0] = p_src0[0]; p_dst[1] = p_src1[0]; p_src0 += 1; p_src1 += 1; p_dst += 2; } p_src0 += ( i_src0_stride - i_width ); p_src1 += ( i_src1_stride - i_width ); p_dst += ( i_dst_stride - ( i_width * 2 ) ); } } static void core_plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst0, int32_t dst0_stride, uint8_t *p_dst1, int32_t dst1_stride, int32_t i_width, int32_t i_height ) { int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w; uint32_t u_res_w0, u_res_w1; v16u8 in0, in1, in2, in3, in4, in5, in6, in7; v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3; v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3; uint8_t *p_dst; i_w_mul8 = i_width - i_width % 8; i_w_mul4 = i_width - i_width % 4; i_h4w = i_height - i_height % 8; for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; ) { for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) { LD_UB8( p_src, i_src_stride, in0, in1, in2, in3, in4, in5, in6, in7 ); p_src += 16; PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 ); PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 ); ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride ); p_dst = p_dst0 + 4 * dst0_stride; ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride ); ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride ); p_dst = p_dst1 + 4 * dst1_stride; ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride ); p_dst0 += 8; p_dst1 += 8; } for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; ) { LD_UB8( p_src, i_src_stride, in0, in1, in2, in3, in4, in5, in6, in7 ); p_src += 8; PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 ); PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 ); ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride ); p_dst = p_dst0 + 4 * dst0_stride; ST4x4_UB( vec_pckev2, 
vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride ); ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride ); p_dst = p_dst1 + 4 * dst1_stride; ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride ); p_dst0 += 4; p_dst1 += 4; } for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ ) { p_dst0[0] = p_src[0]; p_dst1[0] = p_src[1]; p_dst0[dst0_stride] = p_src[i_src_stride]; p_dst1[dst1_stride] = p_src[i_src_stride + 1]; p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride]; p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1]; p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride]; p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1]; p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride]; p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1]; p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride]; p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1]; p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride]; p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1]; p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride]; p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1]; p_dst0 += 1; p_dst1 += 1; p_src += 2; } p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) ); p_dst0 += ( ( 8 * dst0_stride ) - i_width ); p_dst1 += ( ( 8 * dst1_stride ) - i_width ); } for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ ) { for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) { in0 = LD_UB( p_src ); p_src += 16; vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in0 ); vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in0 ); ST8x1_UB( vec_pckev0, p_dst0 ); ST8x1_UB( vec_pckod0, p_dst1 ); p_dst0 += 8; p_dst1 += 8; } for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; ) { in0 = LD_UB( p_src ); p_src += 8; vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in0 ); vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in0 ); u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 ); SW( u_res_w0, p_dst0 ); u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 ); SW( u_res_w1, p_dst1 ); p_dst0 += 4; p_dst1 += 4; } for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ ) { p_dst0[0] = p_src[0]; p_dst1[0] = p_src[1]; p_dst0 += 1; p_dst1 += 1; p_src += 2; } p_src += ( ( i_src_stride ) - ( i_width << 1 ) ); p_dst0 += ( ( dst0_stride ) - i_width ); p_dst1 += ( ( dst1_stride ) - i_width ); } } static void core_plane_copy_deinterleave_rgb_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst0, int32_t i_dst0_stride, uint8_t *p_dst1, int32_t i_dst1_stride, uint8_t *p_dst2, int32_t i_dst2_stride, int32_t i_width, int32_t i_height ) { uint8_t *p_src_orig = p_src; uint8_t *p_dst0_orig = p_dst0; uint8_t *p_dst1_orig = p_dst1; uint8_t *p_dst2_orig = p_dst2; int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4; v16i8 in0, in1, in2, in3, in4, in5, in6, in7; v16i8 temp0, temp1, temp2, temp3; v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 }; v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 }; v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 }; i_w_mul8 = i_width - i_width % 8; i_h_mul4 = i_height - i_height % 4; for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) { p_src = p_src_orig; p_dst0 = p_dst0_orig; p_dst1 = p_dst1_orig; p_dst2 = p_dst2_orig; for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) { LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 ); VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 
); VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 ); ST8x1_UB( temp0, p_dst0 ); ST8x1_UB( temp1, p_dst0 + i_dst0_stride ); ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride ); ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride ); VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 ); VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 ); ST8x1_UB( temp0, p_dst1 ); ST8x1_UB( temp1, p_dst1 + i_dst1_stride ); ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride ); ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride ); VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 ); VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 ); ST8x1_UB( temp0, p_dst2 ); ST8x1_UB( temp1, p_dst2 + i_dst2_stride ); ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride ); ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride ); p_src += 8 * 3; p_dst0 += 8; p_dst1 += 8; p_dst2 += 8; } for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) { p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width]; p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width]; p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width]; p_dst0_orig[i_loop_width + i_dst0_stride] = p_src_orig[0 + i_src_stride + 3 * i_loop_width]; p_dst1_orig[i_loop_width + i_dst1_stride] = p_src_orig[1 + i_src_stride + 3 * i_loop_width]; p_dst2_orig[i_loop_width + i_dst2_stride] = p_src_orig[2 + i_src_stride + 3 * i_loop_width]; p_dst0_orig[i_loop_width + 2 * i_dst0_stride] = p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width]; p_dst1_orig[i_loop_width + 2 * i_dst1_stride] = p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width]; p_dst2_orig[i_loop_width + 2 * i_dst2_stride] = p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width]; p_dst0_orig[i_loop_width + 3 * i_dst0_stride] = p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width]; p_dst1_orig[i_loop_width + 3 * i_dst1_stride] = p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width]; p_dst2_orig[i_loop_width + 3 * i_dst2_stride] = p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width]; } p_src_orig += ( 4 * i_src_stride ); p_dst0_orig += ( 4 * i_dst0_stride ); p_dst1_orig += ( 4 * i_dst1_stride ); p_dst2_orig += ( 4 * i_dst2_stride ); } for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ ) { p_src = p_src_orig; p_dst0 = p_dst0_orig; p_dst1 = p_dst1_orig; p_dst2 = p_dst2_orig; for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) { in0 = LD_SB( p_src ); in4 = LD_SB( p_src + 16 ); temp0 = __msa_vshf_b( mask0, in4, in0 ); ST8x1_UB( temp0, p_dst0 ); temp0 = __msa_vshf_b( mask1, in4, in0 ); ST8x1_UB( temp0, p_dst1 ); temp0 = __msa_vshf_b( mask2, in4, in0 ); ST8x1_UB( temp0, p_dst2 ); p_src += 8 * 3; p_dst0 += 8; p_dst1 += 8; p_dst2 += 8; } for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) { p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width]; p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1]; p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2]; } p_src_orig += ( i_src_stride ); p_dst0_orig += ( i_dst0_stride ); p_dst1_orig += ( i_dst1_stride ); p_dst2_orig += ( i_dst2_stride ); } } static void core_plane_copy_deinterleave_rgba_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst0, int32_t i_dst0_stride, uint8_t *p_dst1, int32_t i_dst1_stride, uint8_t *p_dst2, int32_t i_dst2_stride, int32_t i_width, int32_t i_height ) { uint8_t *p_src_orig = p_src; uint8_t *p_dst0_orig = p_dst0; uint8_t *p_dst1_orig = p_dst1; uint8_t *p_dst2_orig = p_dst2; int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4; v16i8 in0, in1, in2, in3, in4, in5, in6, 
in7; v16i8 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15; i_w_mul8 = i_width - i_width % 8; i_h_mul4 = i_height - i_height % 4; for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) { p_src = p_src_orig; p_dst0 = p_dst0_orig; p_dst1 = p_dst1_orig; p_dst2 = p_dst2_orig; for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) { LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 ); LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 ); LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 ); PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 ); temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 ); PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 ); temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 ); temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 ); PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 ); temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 ); temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 ); PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 ); temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 ); temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 ); PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 ); in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 ); PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 ); in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 ); PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 ); in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 ); PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 ); in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 ); ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride ); ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride ); ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride ); p_src += 16 * 4; p_dst0 += 16; p_dst1 += 16; p_dst2 += 16; } for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; ) { LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 ); PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 ); temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 ); PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 ); temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 ); temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 ); PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 ); in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 ); PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 ); in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 ); PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 ); in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 ); PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 ); in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 ); ST8x1_UB( in0, p_dst0 ); ST8x1_UB( in4, p_dst0 + i_dst0_stride ); ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride ); ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride ); ST8x1_UB( in1, p_dst2 ); ST8x1_UB( in5, p_dst2 + i_dst2_stride ); ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride ); ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride ); ST8x1_UB( in2, p_dst1 ); ST8x1_UB( in6, p_dst1 + i_dst1_stride ); ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride ); ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride ); p_src += 8 * 4; p_dst0 += 8; p_dst1 += 8; p_dst2 += 8; } for( i_loop_width = i_w_mul8; i_loop_width < 
i_width; i_loop_width++ ) { p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width]; p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1]; p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2]; p_dst0_orig[i_dst0_stride + i_loop_width] = p_src_orig[i_src_stride + 4 * i_loop_width]; p_dst1_orig[i_dst1_stride + i_loop_width] = p_src_orig[i_src_stride + 4 * i_loop_width + 1]; p_dst2_orig[i_dst2_stride + i_loop_width] = p_src_orig[i_src_stride + 4 * i_loop_width + 2]; p_dst0_orig[2 * i_dst0_stride + i_loop_width] = p_src_orig[2 * i_src_stride + 4 * i_loop_width]; p_dst1_orig[2 * i_dst1_stride + i_loop_width] = p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1]; p_dst2_orig[2 * i_dst2_stride + i_loop_width] = p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2]; p_dst0_orig[3 * i_dst0_stride + i_loop_width] = p_src_orig[3 * i_src_stride + 4 * i_loop_width]; p_dst1_orig[3 * i_dst1_stride + i_loop_width] = p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1]; p_dst2_orig[3 * i_dst2_stride + i_loop_width] = p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2]; } p_src_orig += ( 4 * i_src_stride ); p_dst0_orig += ( 4 * i_dst0_stride ); p_dst1_orig += ( 4 * i_dst1_stride ); p_dst2_orig += ( 4 * i_dst2_stride ); } for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ ) { p_src = p_src_orig; p_dst0 = p_dst0_orig; p_dst1 = p_dst1_orig; p_dst2 = p_dst2_orig; for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) { LD_SB4( p_src, 16, in0, in4, in8, in12 ); PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 ); temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 ); PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 ); in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 ); ST_SB( in0, p_dst0 ); ST_SB( in0, p_dst0 ); ST_SB( in1, p_dst2 ); ST_SB( in1, p_dst2 ); ST_SB( in2, p_dst1 ); ST_SB( in2, p_dst1 ); p_src += 16 * 4; p_dst0 += 16; p_dst1 += 16; p_dst2 += 16; } for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; ) { in0 = LD_SB( p_src ); in4 = LD_SB( p_src + 16 ); temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 ); temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 ); in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 ); ST8x1_UB( in0, p_dst0 ); ST8x1_UB( in1, p_dst2 ); ST8x1_UB( in2, p_dst1 ); p_src += 8 * 4; p_dst0 += 8; p_dst1 += 8; p_dst2 += 8; } for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) { p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width]; p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1]; p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2]; } p_src_orig += ( i_src_stride ); p_dst0_orig += ( i_dst0_stride ); p_dst1_orig += ( i_dst1_stride ); p_dst2_orig += ( i_dst2_stride ); } } static void core_store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride, uint8_t *p_src1, int32_t i_src1_stride, uint8_t *p_dst, int32_t i_dst_stride, int32_t i_height ) { int32_t i_loop_height, i_h4w; v16u8 in0, in1, in2, in3, in4, in5, in6, in7; v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3; i_h4w = i_height % 4; for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) { LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 ); p_src0 += ( 4 * i_src0_stride ); LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 ); p_src1 += ( 4 * i_src1_stride ); ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3, ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 ); ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3, p_dst, 
i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } for( i_loop_height = i_h4w; i_loop_height--; ) { in0 = LD_UB( p_src0 ); p_src0 += ( i_src0_stride ); in1 = LD_UB( p_src1 ); p_src1 += ( i_src1_stride ); ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 ); ST_UB( ilvr_vec0, p_dst ); p_dst += ( i_dst_stride ); } } static void core_frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst0, int32_t dst0_stride, uint8_t *p_dst1, int32_t dst1_stride, uint8_t *p_dst2, int32_t dst2_stride, uint8_t *p_dst3, int32_t dst3_stride, int32_t i_width, int32_t i_height ) { int32_t i_loop_width, i_loop_height, i_w16_mul; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5; v16u8 pckev_vec0, pckev_vec1, pckev_vec2; v16u8 pckod_vec0, pckod_vec1, pckod_vec2; v16u8 tmp0, tmp1, tmp2, tmp3; v16u8 res0, res1; i_w16_mul = i_width - i_width % 16; for( i_loop_height = i_height; i_loop_height--; ) { LD_UB3( p_src, i_src_stride, src0, src1, src2 ); p_src += 16; for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ ) { LD_UB3( p_src, i_src_stride, src3, src4, src5 ); p_src += 16; LD_UB3( p_src, i_src_stride, src6, src7, src8 ); p_src += 16; PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 ); PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 ); pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5, ( v16i8 ) src2 ); pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5, ( v16i8 ) src2 ); AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, tmp0, tmp1, tmp2, tmp3 ); AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); ST_UB( res0, p_dst0 ); ST_UB( res1, p_dst2 ); SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 ); SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 ); SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 ); PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1, pckev_vec0, pckev_vec1 ) pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5, ( v16i8 ) sld1_vec2 ); AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, tmp0, tmp1, tmp2, tmp3 ); AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); ST_UB( res0, p_dst1 ); ST_UB( res1, p_dst3 ); src0 = src6; src1 = src7; src2 = src8; p_dst0 += 16; p_dst1 += 16; p_dst2 += 16; p_dst3 += 16; } for( i_loop_width = i_w16_mul; i_loop_width < i_width; i_loop_width += 8 ) { LD_UB3( p_src, i_src_stride, src3, src4, src5 ); p_src += 16; PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 ); PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 ); pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5, ( v16i8 ) src2 ); pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5, ( v16i8 ) src2 ); AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, tmp0, tmp1, tmp2, tmp3 ); AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); ST8x1_UB( res0, p_dst0 ); ST8x1_UB( res1, p_dst2 ); SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 ); SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 ); SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 ); PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1, pckev_vec0, pckev_vec1 ) pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5, ( v16i8 ) sld1_vec2 ); AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, pckev_vec2, pckev_vec1, pckod_vec2, 
pckod_vec1, tmp0, tmp1, tmp2, tmp3 ); AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); ST8x1_UB( res0, p_dst1 ); ST8x1_UB( res1, p_dst3 ); p_dst0 += 8; p_dst1 += 8; p_dst2 += 8; p_dst3 += 8; } p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) ); p_dst0 += ( dst0_stride - i_width ); p_dst1 += ( dst1_stride - i_width ); p_dst2 += ( dst2_stride - i_width ); p_dst3 += ( dst3_stride - i_width ); } } static void mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, int32_t i_height ) { copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); } static void mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, int32_t i_height ) { copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); } static void mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, int32_t i_height ) { copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); } static void pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t 
pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 4 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 4, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 4, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 16, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 8, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 4 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 4, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 4, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride, uint8_t *p_pix2, intptr_t pix2_stride, uint8_t *p_pix3, intptr_t pix3_stride, int32_t i_weight ) { if( 32 == i_weight ) { avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 2 ); } else if( i_weight < 0 || i_weight > 63 ) { avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 5, i_weight, ( 64 - i_weight ), 0 ); } else { avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, p_pix1, pix1_stride, 5, i_weight, ( 64 - i_weight ), 0 ); } } static void memzero_aligned_msa( void *p_dst, size_t n ) { uint32_t u_tot32_mul_lines = n >> 5; uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 ); memset_zero_16width_msa( p_dst, 16, ( n / 16 ) ); if( u_remaining ) { memset( p_dst + ( u_tot32_mul_lines << 5 ), 0, u_remaining ); } } static void mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, const x264_weight_t *pWeight, int32_t i_height ) { int32_t i_log2_denom = pWeight->i_denom; int32_t i_offset = pWeight->i_offset; int32_t i_weight = pWeight->i_scale; avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, 
i_dst_stride, i_height, i_log2_denom, i_weight, i_offset ); } static void mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, const x264_weight_t *pWeight, int32_t i_height ) { int32_t i_log2_denom = pWeight->i_denom; int32_t i_offset = pWeight->i_offset; int32_t i_weight = pWeight->i_scale; avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height, i_log2_denom, i_weight, i_offset ); } static void mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, const x264_weight_t *pWeight, int32_t i_height ) { int32_t i_log2_denom = pWeight->i_denom; int32_t i_offset = pWeight->i_offset; int32_t i_weight = pWeight->i_scale; avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height, i_log2_denom, i_weight, i_offset ); } static void mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, const x264_weight_t *pWeight, int32_t i_height ) { mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride, pWeight, i_height ); mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride, pWeight, i_height ); } static void mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src[4], intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height, const x264_weight_t *pWeight ) { int32_t i_qpel_idx; int32_t i_offset; uint8_t *p_src1; i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + ( 3 == ( m_vy & 3 ) ) * i_src_stride; if( i_qpel_idx & 5 ) { uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + i_offset + ( 3 == ( m_vx&3 ) ); if( 16 == i_width ) { avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, i_dst_stride, i_height ); } else if( 8 == i_width ) { avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, i_dst_stride, i_height ); } else if( 4 == i_width ) { avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, i_dst_stride, i_height ); } if( pWeight->weightfn ) { if( 16 == i_width ) { mc_weight_w16_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); } else if( 8 == i_width ) { mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); } else if( 4 == i_width ) { mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); } } } else if( pWeight->weightfn ) { if( 16 == i_width ) { mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); } else if( 8 == i_width ) { mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); } else if( 4 == i_width ) { mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); } } else { if( 16 == i_width ) { copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride, i_height ); } else if( 8 == i_width ) { copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride, i_height ); } else if( 4 == i_width ) { copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride, i_height ); } } } static void mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v, intptr_t i_dst_stride, uint8_t *p_src, intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height ) { int32_t i_d8x = m_vx & 0x07; int32_t i_d8y = m_vy & 0x07; int32_t i_coeff_horiz1 = ( 8 - i_d8x ); int32_t i_coeff_vert1 = ( 8 - i_d8y ); int32_t i_coeff_horiz0 = i_d8x; int32_t i_coeff_vert0 = i_d8y; p_src += ( m_vy 
>> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2; if( 2 == i_width ) { avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, i_coeff_horiz0, i_coeff_horiz1, i_coeff_vert0, i_coeff_vert1, i_height ); } else if( 4 == i_width ) { avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, i_coeff_horiz0, i_coeff_horiz1, i_coeff_vert0, i_coeff_vert1, i_height ); } else if( 8 == i_width ) { avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride, p_dst_u, p_dst_v, i_dst_stride, i_coeff_horiz0, i_coeff_horiz1, i_coeff_vert0, i_coeff_vert1, i_height ); } } static void hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v, uint8_t *p_dstc, uint8_t *p_src, intptr_t i_stride, int32_t i_width, int32_t i_height, int16_t *p_buf ) { for( int32_t i = 0; i < ( i_width / 16 ); i++ ) { avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride, p_dst_v - 2, i_stride, i_height ); avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ) , i_stride, p_dstc, i_stride, i_height ); avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height ); p_src += 16; p_dst_v += 16; p_dsth += 16; p_dstc += 16; } } static void plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src0, intptr_t i_src_stride0, uint8_t *p_src1, intptr_t i_src_stride1, int32_t i_width, int32_t i_height ) { core_plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1, p_dst, i_dst_stride, i_width, i_height ); } static void plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, uint8_t *p_dst1, intptr_t i_dst_stride1, uint8_t *p_src, intptr_t i_src_stride, int32_t i_width, int32_t i_height ) { core_plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0, p_dst1, i_dst_stride1, i_width, i_height ); } static void plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, uint8_t *p_dst1, intptr_t i_dst_stride1, uint8_t *p_dst2, intptr_t i_dst_stride2, uint8_t *p_src, intptr_t i_src_stride, int32_t i_src_width, int32_t i_width, int32_t i_height ) { if( 3 == i_src_width ) { core_plane_copy_deinterleave_rgb_msa( p_src, i_src_stride, p_dst0, i_dst_stride0, p_dst1, i_dst_stride1, p_dst2, i_dst_stride2, i_width, i_height ); } else if( 4 == i_src_width ) { core_plane_copy_deinterleave_rgba_msa( p_src, i_src_stride, p_dst0, i_dst_stride0, p_dst1, i_dst_stride1, p_dst2, i_dst_stride2, i_width, i_height ); } } static void store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src0, uint8_t *p_src1, int32_t i_height ) { core_store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE, p_dst, i_dst_stride, i_height ); } static void load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src, intptr_t i_src_stride, int32_t i_height ) { core_plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE, ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE, 8, i_height ); } static void load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src, intptr_t i_src_stride, int32_t i_height ) { core_plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE, ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE, 8, i_height ); } static void frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0, uint8_t *p_dst1, uint8_t *p_dst2, uint8_t *p_dst3, intptr_t i_src_stride, intptr_t i_dst_stride, int32_t i_width, int32_t i_height ) { core_frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride, p_dst1, i_dst_stride, p_dst2, i_dst_stride, p_dst3, i_dst_stride, i_width, i_height 
); } static uint8_t *get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride, uint8_t *p_src[4], intptr_t i_src_stride, int32_t m_vx, int32_t m_vy, int32_t i_width, int32_t i_height, const x264_weight_t *pWeight ) { int32_t i_qpel_idx, i_cnt, i_h4w; int32_t i_offset; uint8_t *p_src1, *src1_org; i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + ( 3 == ( m_vy & 3 ) ) * i_src_stride; i_h4w = i_height - i_height%4; if( i_qpel_idx & 5 ) { uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + i_offset + ( 3 == ( m_vx & 3 ) ); if( 16 == i_width ) { avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, *p_dst_stride, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { v16u8 src_vec1, src_vec2; v16u8 dst_vec0; src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) ); } } else if( 20 == i_width ) { avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, *p_dst_stride, i_h4w ); avg_src_width4_msa( p_src1 + 16, i_src_stride, p_src2 + 16, i_src_stride, p_dst + 16, *p_dst_stride, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { v16u8 src_vec1, src_vec2, src_vec3, src_vec4; v16u8 dst_vec0, dst_vec1; uint32_t temp0; src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 ); src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 ); dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 ); temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 ); ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) ); SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 ); } } else if( 12 == i_width ) { avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, *p_dst_stride, i_h4w ); avg_src_width4_msa( p_src1 + 8, i_src_stride, p_src2 + 8, i_src_stride, p_dst + 8, *p_dst_stride, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint32_t temp0; uint64_t dst0; v16u8 src_vec1, src_vec2; v16u8 dst_vec0; src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 ); temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 ); SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) ); SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 ); } } else if( 8 == i_width ) { avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, *p_dst_stride, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint64_t dst0; v16u8 src_vec1, src_vec2; v16u8 dst_vec0; src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 ); SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) ); } } else if( 4 == i_width ) { avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride, p_dst, *p_dst_stride, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint32_t temp0; v16u8 src_vec1, src_vec2; v16u8 dst_vec0; src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 ); SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) 
); } } if( pWeight->weightfn ) { int32_t i_log2_denom; int32_t i_offset_val; int32_t i_weight; i_log2_denom = pWeight->i_denom; i_offset_val = pWeight->i_offset; i_weight = pWeight->i_scale; if( 16 == i_width || 12 == i_width ) { mc_weight_w16_msa( p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0, temp_vec1; v8u16 wgt, offset_val0; v8i16 denom; i_offset_val <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec1 = wgt * temp_vec1; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, ( v8i16 ) offset_val0 ); temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, ( v8i16 ) offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, ( v16i8 ) temp_vec0 ); ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); } } else if( 20 == i_width ) { mc_weight_w20_msa( p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint32_t temp0; v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0, temp_vec1; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 ); temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec1 = wgt * temp_vec1; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, ( v16i8 ) temp_vec0 ); ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, ( v16i8 ) temp_vec0 ); temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 ); } } 
else if( 8 == i_width ) { mc_weight_w8_msa( p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint64_t temp0; v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val = i_offset_val << i_log2_denom; if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, ( v16i8 ) temp_vec0 ); temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); } } else if( 4 == i_width ) { mc_weight_w4_msa( p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_h4w ); for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint32_t temp0; v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) ); src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, ( v16i8 ) temp_vec0 ); temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); } } } return p_dst; } else if( pWeight->weightfn ) { int32_t i_offset_val, i_log2_denom, i_weight; i_log2_denom = pWeight->i_denom; i_offset_val = pWeight->i_offset; i_weight = pWeight->i_scale; i_h4w = i_height - i_height%4; src1_org = p_src1; if( 16 == i_width || 12 == i_width ) { mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_h4w ); p_src1 = src1_org + i_h4w * i_src_stride; for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0, temp_vec1; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); src_vec0 = LD_UB( p_src1 ); p_src1 += i_src_stride; temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec1 = wgt * temp_vec1; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec1 = ( 
v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, ( v16i8 ) temp_vec0 ); ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); } } else if( 20 == i_width ) { mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_h4w ); p_src1 = src1_org + i_h4w * i_src_stride; for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint32_t temp0; v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0, temp_vec1; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); src_vec0 = LD_UB( p_src1 ); temp0 = LW( p_src1 + 16 ); p_src1 += i_src_stride; temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec1 = wgt * temp_vec1; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, ( v16i8 ) temp_vec0 ); ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, ( v16i8 ) temp_vec0 ); temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); SW( temp0,p_dst + i_cnt * ( *p_dst_stride ) + 16 ); } } else if( 8 == i_width ) { mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_h4w ); p_src1 = src1_org + i_h4w * i_src_stride; for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { uint64_t u_temp0; v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val = i_offset_val << i_log2_denom; if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); src_vec0 = LD_UB( p_src1 ); p_src1 += i_src_stride; temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, ( v16i8 ) temp_vec0 ); u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) ); } } else if( 4 == i_width ) { mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_h4w ); p_src1 = src1_org + i_h4w * i_src_stride; for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) { 
uint32_t u_temp0; v16i8 zero = {0}; v16u8 src_vec0; v16i8 tmp0; v8u16 temp_vec0; v8u16 wgt; v8i16 denom, offset_val0; i_offset_val <<= ( i_log2_denom ); if( i_log2_denom ) { i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); } wgt = ( v8u16 ) __msa_fill_h( i_weight ); offset_val0 = __msa_fill_h( i_offset_val ); denom = __msa_fill_h( i_log2_denom ); u_temp0 = LW( p_src1 ); p_src1 += i_src_stride; src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 ); temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); temp_vec0 = wgt * temp_vec0; temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, offset_val0 ); temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, ( v16i8 ) temp_vec0 ); u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) ); } } return p_dst; } else { *p_dst_stride = i_src_stride; return p_src1; } } static weight_fn_t mc_weight_wtab_msa[6] = { mc_weight_w4_msa, mc_weight_w4_msa, mc_weight_w8_msa, mc_weight_w16_msa, mc_weight_w16_msa, mc_weight_w20_msa, }; #endif // !HIGH_BIT_DEPTH void x264_mc_init_mips( uint32_t cpu, x264_mc_functions_t *pf ) { #if !HIGH_BIT_DEPTH if( cpu & X264_CPU_MSA ) { pf->mc_luma = mc_luma_msa; pf->mc_chroma = mc_chroma_msa; pf->get_ref = get_ref_msa; pf->avg[PIXEL_16x16]= pixel_avg_16x16_msa; pf->avg[PIXEL_16x8] = pixel_avg_16x8_msa; pf->avg[PIXEL_8x16] = pixel_avg_8x16_msa; pf->avg[PIXEL_8x8] = pixel_avg_8x8_msa; pf->avg[PIXEL_8x4] = pixel_avg_8x4_msa; pf->avg[PIXEL_4x16] = pixel_avg_4x16_msa; pf->avg[PIXEL_4x8] = pixel_avg_4x8_msa; pf->avg[PIXEL_4x4] = pixel_avg_4x4_msa; pf->avg[PIXEL_4x2] = pixel_avg_4x2_msa; pf->weight = mc_weight_wtab_msa; pf->offsetadd = mc_weight_wtab_msa; pf->offsetsub = mc_weight_wtab_msa; pf->copy_16x16_unaligned = mc_copy_w16_msa; pf->copy[PIXEL_16x16] = mc_copy_w16_msa; pf->copy[PIXEL_8x8] = mc_copy_w8_msa; pf->copy[PIXEL_4x4] = mc_copy_w4_msa; pf->store_interleave_chroma = store_interleave_chroma_msa; pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc_msa; pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec_msa; pf->plane_copy_interleave = plane_copy_interleave_msa; pf->plane_copy_deinterleave = plane_copy_deinterleave_msa; pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_msa; pf->hpel_filter = hpel_filter_msa; pf->memcpy_aligned = memcpy; pf->memzero_aligned = memzero_aligned_msa; pf->frame_init_lowres_core = frame_init_lowres_core_msa; } #endif // !HIGH_BIT_DEPTH } x264-master/common/mips/mc.h000066400000000000000000000025531502133446700160740ustar00rootroot00000000000000/***************************************************************************** * mc.h: msa motion compensation ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Neha Rana * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_MIPS_MC_H #define X264_MIPS_MC_H #define x264_mc_init_mips x264_template(mc_init_mips) void x264_mc_init_mips( uint32_t cpu, x264_mc_functions_t *pf ); #endif x264-master/common/mips/pixel-c.c000066400000000000000000001547151502133446700170410ustar00rootroot00000000000000/***************************************************************************** * pixel-c.c: msa pixel metrics ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Mandar Sahastrabuddhe * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macros.h" #include "pixel.h" #include "predict.h" #if !HIGH_BIT_DEPTH #define CALC_MSE_B( src, ref, var ) \ { \ v16u8 src_l0_m, src_l1_m; \ v8i16 res_l0_m, res_l1_m; \ \ ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ } #define CALC_MSE_AVG_B( src, ref, var, sub ) \ { \ v16u8 src_l0_m, src_l1_m; \ v8i16 res_l0_m, res_l1_m; \ \ ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ \ sub += res_l0_m + res_l1_m; \ } #define VARIANCE_WxH( sse, diff, shift ) \ ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) ) static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height ) { int32_t i_ht_cnt; uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3; v16u8 src = { 0 }; v16u8 ref = { 0 }; v16u8 diff; v8u16 sad = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); p_src += ( 4 * i_src_stride ); LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); p_ref += ( 4 * i_ref_stride ); INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); diff = __msa_asub_u_b( src, ref ); sad += __msa_hadd_u_h( diff, diff ); } return ( HADD_UH_U32( sad ) ); } static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height ) { int32_t i_ht_cnt; v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; v8u16 sad = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref += ( 4 * i_ref_stride ); PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, ref0, ref1 ); sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); } return ( HADD_UH_U32( sad ) ); } static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height ) { int32_t i_ht_cnt; v16u8 src0, src1, ref0, ref1; v8u16 sad = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LD_UB2( p_src, i_src_stride, src0, src1 ); p_src += ( 2 * i_src_stride ); LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); p_ref += ( 2 * i_ref_stride ); sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); LD_UB2( p_src, i_src_stride, src0, src1 ); p_src += ( 2 * i_src_stride ); LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); p_ref += ( 2 * i_ref_stride ); sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); } return ( HADD_UH_U32( sad ) ); } static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, int32_t i_ref_stride, int32_t i_height, uint32_t *pu_sad_array ) { int32_t i_ht_cnt; v16u8 src = { 0 }; uint32_t src0, src1, src2, src3, load0, load1, load2, load3; v16u8 ref0 = { 0 }; v16u8 ref1 = { 0 }; v16u8 ref2 = { 0 }; v16u8 diff; v8u16 sad0 = { 0 }; v8u16 sad1 = { 0 }; v8u16 sad2 = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LW4( p_src, i_src_stride, src0, src1, src2, src3 ); INSERT_W4_UB( src0, src1, src2, src3, src ); p_src += ( 4 * i_src_stride ); LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 ); 
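/* For each of the three references, pack the four 4-byte rows into one vector, take the absolute difference against the source block and accumulate it into that reference's SAD. */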
INSERT_W4_UB( load0, load1, load2, load3, ref0 ); p_ref0 += ( 4 * i_ref_stride ); LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 ); INSERT_W4_UB( load0, load1, load2, load3, ref1 ); p_ref1 += ( 4 * i_ref_stride ); LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 ); INSERT_W4_UB( load0, load1, load2, load3, ref2 ); p_ref2 += ( 4 * i_ref_stride ); diff = __msa_asub_u_b( src, ref0 ); sad0 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref1 ); sad1 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref2 ); sad2 += __msa_hadd_u_h( diff, diff ); } pu_sad_array[0] = HADD_UH_U32( sad0 ); pu_sad_array[1] = HADD_UH_U32( sad1 ); pu_sad_array[2] = HADD_UH_U32( sad2 ); } static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, int32_t i_ref_stride, int32_t i_height, uint32_t *pu_sad_array ) { int32_t i_ht_cnt; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref00, ref11, ref22, ref33; v8u16 sad0 = { 0 }; v8u16 sad1 = { 0 }; v8u16 sad2 = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 ); p_ref0 += ( 4 * i_ref_stride ); PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, ref0, ref1 ); sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 ); LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 ); p_ref1 += ( 4 * i_ref_stride ); PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 ); sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 ); LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 ); p_ref2 += ( 4 * i_ref_stride ); PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 ); sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 ); } pu_sad_array[0] = HADD_UH_U32( sad0 ); pu_sad_array[1] = HADD_UH_U32( sad1 ); pu_sad_array[2] = HADD_UH_U32( sad2 ); } static void sad_16width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, int32_t i_ref_stride, int32_t i_height, uint32_t *pu_sad_array ) { int32_t i_ht_cnt; v16u8 src, ref; v16u8 diff; v8u16 sad0 = { 0 }; v8u16 sad1 = { 0 }; v8u16 sad2 = { 0 }; for( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; ) { src = LD_UB( p_src ); p_src += i_src_stride; ref = LD_UB( p_ref0 ); p_ref0 += i_ref_stride; diff = __msa_asub_u_b( src, ref ); sad0 += __msa_hadd_u_h( diff, diff ); ref = LD_UB( p_ref1 ); p_ref1 += i_ref_stride; diff = __msa_asub_u_b( src, ref ); sad1 += __msa_hadd_u_h( diff, diff ); ref = LD_UB( p_ref2 ); p_ref2 += i_ref_stride; diff = __msa_asub_u_b( src, ref ); sad2 += __msa_hadd_u_h( diff, diff ); src = LD_UB( p_src ); p_src += i_src_stride; ref = LD_UB( p_ref0 ); p_ref0 += i_ref_stride; diff = __msa_asub_u_b( src, ref ); sad0 += __msa_hadd_u_h( diff, diff ); ref = LD_UB( p_ref1 ); p_ref1 += i_ref_stride; diff = __msa_asub_u_b( src, ref ); sad1 += __msa_hadd_u_h( diff, diff ); ref = LD_UB( p_ref2 ); p_ref2 += i_ref_stride; diff = __msa_asub_u_b( src, ref ); sad2 += __msa_hadd_u_h( diff, diff ); } pu_sad_array[0] = HADD_UH_U32( sad0 ); pu_sad_array[1] = HADD_UH_U32( sad1 ); pu_sad_array[2] = HADD_UH_U32( sad2 ); } static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_aref[], int32_t i_ref_stride, int32_t i_height, uint32_t *pu_sad_array ) { uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; int32_t i_ht_cnt; uint32_t src0, src1, src2, src3; uint32_t ref0, ref1, ref2, ref3; v16u8 src = { 0 }; v16u8 ref = { 0 }; v16u8 diff; v8u16 sad0 = { 0 }; v8u16 
sad1 = { 0 }; v8u16 sad2 = { 0 }; v8u16 sad3 = { 0 }; p_ref0 = p_aref[0]; p_ref1 = p_aref[1]; p_ref2 = p_aref[2]; p_ref3 = p_aref[3]; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LW4( p_src, i_src_stride, src0, src1, src2, src3 ); INSERT_W4_UB( src0, src1, src2, src3, src ); p_src += ( 4 * i_src_stride ); LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 ); INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); p_ref0 += ( 4 * i_ref_stride ); diff = __msa_asub_u_b( src, ref ); sad0 += __msa_hadd_u_h( diff, diff ); LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 ); INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); p_ref1 += ( 4 * i_ref_stride ); diff = __msa_asub_u_b( src, ref ); sad1 += __msa_hadd_u_h( diff, diff ); LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 ); INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); p_ref2 += ( 4 * i_ref_stride ); diff = __msa_asub_u_b( src, ref ); sad2 += __msa_hadd_u_h( diff, diff ); LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 ); INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); p_ref3 += ( 4 * i_ref_stride ); diff = __msa_asub_u_b( src, ref ); sad3 += __msa_hadd_u_h( diff, diff ); } pu_sad_array[0] = HADD_UH_U32( sad0 ); pu_sad_array[1] = HADD_UH_U32( sad1 ); pu_sad_array[2] = HADD_UH_U32( sad2 ); pu_sad_array[3] = HADD_UH_U32( sad3 ); } static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_aref[], int32_t i_ref_stride, int32_t i_height, uint32_t *pu_sad_array ) { int32_t i_ht_cnt; uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; v8u16 sad0 = { 0 }; v8u16 sad1 = { 0 }; v8u16 sad2 = { 0 }; v8u16 sad3 = { 0 }; p_ref0 = p_aref[0]; p_ref1 = p_aref[1]; p_ref2 = p_aref[2]; p_ref3 = p_aref[3]; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref0 += ( 4 * i_ref_stride ); LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 ); p_ref1 += ( 4 * i_ref_stride ); LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 ); p_ref2 += ( 4 * i_ref_stride ); LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 ); p_ref3 += ( 4 * i_ref_stride ); PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 ); PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 ); sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 ); PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 ); sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 ); PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 ); sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 ); PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 ); sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 ); } pu_sad_array[0] = HADD_UH_U32( sad0 ); pu_sad_array[1] = HADD_UH_U32( sad1 ); pu_sad_array[2] = HADD_UH_U32( sad2 ); pu_sad_array[3] = HADD_UH_U32( sad3 ); } static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_aref[], int32_t i_ref_stride, int32_t i_height, uint32_t *pu_sad_array ) { int32_t i_ht_cnt; uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; v16u8 src, ref0, ref1, ref2, ref3, diff; v8u16 sad0 = { 0 }; v8u16 sad1 = { 0 }; v8u16 sad2 = { 0 }; v8u16 sad3 = { 0 }; p_ref0 = p_aref[0]; p_ref1 = p_aref[1]; p_ref2 = p_aref[2]; p_ref3 = p_aref[3]; for( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; ) { src = LD_UB( p_src ); p_src += i_src_stride; ref0 = LD_UB( p_ref0 ); p_ref0 += i_ref_stride; ref1 = LD_UB( p_ref1 ); p_ref1 += i_ref_stride; ref2 = LD_UB( p_ref2 ); p_ref2 += 
i_ref_stride; ref3 = LD_UB( p_ref3 ); p_ref3 += i_ref_stride; diff = __msa_asub_u_b( src, ref0 ); sad0 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref1 ); sad1 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref2 ); sad2 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref3 ); sad3 += __msa_hadd_u_h( diff, diff ); src = LD_UB( p_src ); p_src += i_src_stride; ref0 = LD_UB( p_ref0 ); p_ref0 += i_ref_stride; ref1 = LD_UB( p_ref1 ); p_ref1 += i_ref_stride; ref2 = LD_UB( p_ref2 ); p_ref2 += i_ref_stride; ref3 = LD_UB( p_ref3 ); p_ref3 += i_ref_stride; diff = __msa_asub_u_b( src, ref0 ); sad0 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref1 ); sad1 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref2 ); sad2 += __msa_hadd_u_h( diff, diff ); diff = __msa_asub_u_b( src, ref3 ); sad3 += __msa_hadd_u_h( diff, diff ); } pu_sad_array[0] = HADD_UH_U32( sad0 ); pu_sad_array[1] = HADD_UH_U32( sad1 ); pu_sad_array[2] = HADD_UH_U32( sad2 ); pu_sad_array[3] = HADD_UH_U32( sad3 ); } static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride, uint8_t i_height ) { uint32_t u_sum = 0, u_sqr_out = 0, u_cnt; v16i8 pix, zero = { 0 }; v8u16 add, pix_r, pix_l; v4u32 sqr = { 0 }; for( u_cnt = i_height; u_cnt--; ) { pix = LD_SB( p_pix ); p_pix += i_stride; add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix ); u_sum += HADD_UH_U32( add ); ILVRL_B2_UH( zero, pix, pix_r, pix_l ); sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r ); sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l ); } u_sqr_out = HADD_SW_S32( sqr ); return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) ); } static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride, uint8_t i_height ) { uint32_t u_sum = 0, u_sqr_out = 0, u_cnt; v16i8 pix, zero = { 0 }; v8u16 add, pix_r; v4u32 sqr = { 0 }; for( u_cnt = i_height; u_cnt--; ) { pix = LD_SB( p_pix ); p_pix += i_stride; pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix ); add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r ); u_sum += HADD_UH_U32( add ); sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r ); } u_sqr_out = HADD_SW_S32( sqr ); return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) ); } static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height, int32_t *p_diff ) { int32_t i_ht_cnt; uint32_t u_sse; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3; v8i16 avg = { 0 }; v4i32 vec, var = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref += ( 4 * i_ref_stride ); PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, ref0, ref1 ); CALC_MSE_AVG_B( src0, ref0, var, avg ); CALC_MSE_AVG_B( src1, ref1, var, avg ); } vec = __msa_hadd_s_w( avg, avg ); *p_diff = HADD_SW_S32( vec ); u_sse = HADD_SW_S32( var ); return u_sse; } static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height ) { int32_t i_ht_cnt; uint32_t u_sse; uint32_t u_src0, u_src1, u_src2, u_src3; uint32_t u_ref0, u_ref1, u_ref2, u_ref3; v16u8 src = { 0 }; v16u8 ref = { 0 }; v4i32 var = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); p_src += ( 4 * i_src_stride ); LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); p_ref += ( 4 * i_ref_stride ); INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); 
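/* Pack the reference rows the same way; CALC_MSE_B then accumulates the squared differences into var. */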
INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); CALC_MSE_B( src, ref, var ); } u_sse = HADD_SW_S32( var ); return u_sse; } static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height ) { int32_t i_ht_cnt; uint32_t u_sse; v16u8 src0, src1, src2, src3; v16u8 ref0, ref1, ref2, ref3; v4i32 var = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref += ( 4 * i_ref_stride ); PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, ref0, ref1 ); CALC_MSE_B( src0, ref0, var ); CALC_MSE_B( src1, ref1, var ); } u_sse = HADD_SW_S32( var ); return u_sse; } static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, int32_t i_height ) { int32_t i_ht_cnt; uint32_t u_sse; v16u8 src, ref; v4i32 var = { 0 }; for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) { src = LD_UB( p_src ); p_src += i_src_stride; ref = LD_UB( p_ref ); p_ref += i_ref_stride; CALC_MSE_B( src, ref, var ); src = LD_UB( p_src ); p_src += i_src_stride; ref = LD_UB( p_ref ); p_ref += i_ref_stride; CALC_MSE_B( src, ref, var ); src = LD_UB( p_src ); p_src += i_src_stride; ref = LD_UB( p_ref ); p_ref += i_ref_stride; CALC_MSE_B( src, ref, var ); src = LD_UB( p_src ); p_src += i_src_stride; ref = LD_UB( p_ref ); p_ref += i_ref_stride; CALC_MSE_B( src, ref, var ); } u_sse = HADD_SW_S32( var ); return u_sse; } static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride, const uint8_t *p_ref, int32_t i_ref_stride, int32_t pi_sum_array[2][4] ) { v16i8 zero = { 0 }; v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; v8u16 temp0, temp1, temp2, temp3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4u32 tmp0; v4i32 tmp2, tmp3; LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += ( 4 * i_src_stride ); LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref += ( 4 * i_ref_stride ); ILVR_D2_UB( src1, src0, src3, src2, src0, src2 ); ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 ); HADD_UB2_UH( src0, src2, temp0, temp1 ); temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 ); pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 ); HADD_UB2_UH( ref0, ref2, temp0, temp1 ); temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 ); pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 ); ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2, vec4, vec6 ); ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3, vec5, vec7 ); tmp0 = __msa_dotp_u_w( vec0, vec0 ); tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 ); tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 ); tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 ); tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 ); tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 ); tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 ); tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 ); tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 ); tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 ); 
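/* Element [2] of each 4x4 block's sums: the combined sum of squared source and reference samples. */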
pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 ); pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 ); tmp0 = __msa_dotp_u_w( vec4, vec0 ); tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 ); tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 ); tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 ); tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 ); tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 ); pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 ); pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 ); } static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, uint8_t i_height ) { int32_t cnt; uint32_t u_sum = 0; v16i8 src0, src1, src2, src3; v16i8 ref0, ref1, ref2, ref3; v8i16 zero = { 0 }; v8i16 diff0, diff1, diff2, diff3; v8i16 temp0, temp1, temp2, temp3; for( cnt = i_height >> 2; cnt--; ) { LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += 4 * i_src_stride; LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref += 4 * i_ref_stride; ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, diff0, diff1, diff2, diff3 ); HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); diff0 = __msa_add_a_h( diff0, zero ); diff1 = __msa_add_a_h( diff1, zero ); diff2 = __msa_add_a_h( diff2, zero ); diff3 = __msa_add_a_h( diff3, zero ); diff0 = ( diff0 + diff1 + diff2 + diff3 ); diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 ); diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 ); u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 ); } return ( u_sum >> 1 ); } static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride, uint8_t i_height ) { int32_t cnt; uint32_t u_sum = 0; v16i8 src0, src1, src2, src3; v16i8 ref0, ref1, ref2, ref3; v8i16 zero = { 0 }; v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; v8i16 temp0, temp1, temp2, temp3; for( cnt = i_height >> 2; cnt--; ) { LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); p_src += 4 * i_src_stride; LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); p_ref += 4 * i_ref_stride; ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, diff0, diff1, diff2, diff3 ); HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3, diff0, diff2, diff4, diff6 ); diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 ); diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 ); diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 ); diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, diff0, diff1, diff2, diff3, 
diff4, diff5, diff6, diff7 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); diff0 = __msa_add_a_h( diff0, zero ); diff1 = __msa_add_a_h( diff1, zero ); diff2 = __msa_add_a_h( diff2, zero ); diff3 = __msa_add_a_h( diff3, zero ); diff0 = ( diff0 + diff1 + diff2 + diff3 ); u_sum += HADD_UH_U32( diff0 ); } return ( u_sum >> 1 ); } static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_ref_stride ) { uint32_t u_sum = 0; v16i8 src0, src1, src2, src3, src4, src5, src6, src7; v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; v8i16 zero = { 0 }; v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7; v8i16 temp0, temp1, temp2, temp3; LD_SB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); LD_SB8( p_ref, i_ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 ); ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1, sub2, sub3 ); ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5, sub6, sub7 ); HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 ); HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 ); TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 ); BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 ); BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); temp0 = diff0 + diff4; temp1 = diff1 + diff5; temp2 = diff2 + diff6; temp3 = diff3 + diff7; temp0 = __msa_add_a_h( temp0, zero ); temp1 = __msa_add_a_h( temp1, zero ); temp2 = __msa_add_a_h( temp2, zero ); temp3 = __msa_add_a_h( temp3, zero ); diff0 = temp0 + __msa_asub_s_h( diff0, diff4 ); diff1 = temp1 + __msa_asub_s_h( diff1, diff5 ); diff2 = temp2 + __msa_asub_s_h( diff2, diff6 ); diff3 = temp3 + __msa_asub_s_h( diff3, diff7 ); diff0 = ( diff0 + diff1 + diff2 + diff3 ); u_sum = HADD_UH_U32( diff0 ); return u_sum; } static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride ) { int16_t tmp0, tmp1, tmp2, tmp3; uint32_t u_sum4 = 0, u_sum8 = 0, u_dc; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 zero = { 0 }; v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7; v8i16 temp0, temp1, temp2, temp3; LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1, diff2, diff3 ); ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5, diff6, diff7 ); TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 ); BUTTERFLY_4( diff0, 
diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); tmp0 = diff0[0]; tmp1 = diff0[4]; tmp2 = diff4[0]; tmp3 = diff4[4]; sub0 = __msa_add_a_h( diff0, zero ); sub1 = __msa_add_a_h( diff1, zero ); sub2 = __msa_add_a_h( diff2, zero ); sub3 = __msa_add_a_h( diff3, zero ); sub4 = __msa_add_a_h( diff4, zero ); sub5 = __msa_add_a_h( diff5, zero ); sub6 = __msa_add_a_h( diff6, zero ); sub7 = __msa_add_a_h( diff7, zero ); sub0 = ( sub0 + sub1 + sub2 + sub3 ); sub1 = ( sub4 + sub5 + sub6 + sub7 ); sub0 += sub1; u_sum4 += HADD_UH_U32( sub0 ); TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 ); ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 ); ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 ); diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 ); diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 ); diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 ); diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 ); BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); sub0 = __msa_add_a_h( diff0, zero ); sub1 = __msa_add_a_h( diff1, zero ); sub2 = __msa_add_a_h( diff2, zero ); sub3 = __msa_add_a_h( diff3, zero ); sub4 = __msa_add_a_h( diff4, zero ); sub5 = __msa_add_a_h( diff5, zero ); sub6 = __msa_add_a_h( diff6, zero ); sub7 = __msa_add_a_h( diff7, zero ); sub0 = ( sub0 + sub1 + sub2 + sub3 ); sub1 = ( sub4 + sub5 + sub6 + sub7 ); sub0 += sub1; u_sum8 += HADD_UH_U32( sub0 ); u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 ); u_sum4 = u_sum4 - u_dc; u_sum8 = u_sum8 - u_dc; return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4; } int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); } int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); } int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); } int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); } int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); } int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) 
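/* 4x16 SAD: thin wrapper over the width-4 helper with height 16. */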
{ return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); } int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); } int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); } void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ) { uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 16, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 8, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 16, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, 
uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 8, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 4, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 8, ( uint32_t * ) p_sad_array ); } void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ) { sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, i_ref_stride, 4, ( uint32_t * ) p_sad_array ); } int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); } int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); } int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); } int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); } int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); } int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); } int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); } int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ) { return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); } void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_intra_predict_vert_4x4_msa( p_dec ); p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_hor_4x4_msa( p_dec ); p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_dc_4x4_msa( p_dec ); p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_intra_predict_vert_16x16_msa( p_dec ); p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_hor_16x16_msa( p_dec ); p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_dc_16x16_msa( p_dec ); p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ) { ALIGNED_ARRAY_16( uint8_t, pix, [8 * 
FDEC_STRIDE] ); x264_intra_predict_v_8x8_msa( pix, p_edge ); p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_h_8x8_msa( pix, p_edge ); p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_dc_8x8_msa( pix, p_edge ); p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_intra_predict_dc_4blk_8x8_msa( p_dec ); p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_hor_8x8_msa( p_dec ); p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_vert_8x8_msa( p_dec ); p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, const uint8_t *p_pix2, intptr_t i_stride2, int32_t i_sums[2][4] ) { ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums ); } uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ) { uint64_t u_sum; u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride ); u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride ); u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride ); return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); } int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 ); } int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); } int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); } int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 ); } int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); } int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); } int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { uint32_t u32Sum = 0; u32Sum = pixel_satd_8width_msa( p_pix1, 
i_stride, p_pix2, i_stride2, 8 ); u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride, p_pix2 + 8, i_stride2, 8 ); return u32Sum; } int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { uint32_t u32Sum = 0; u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride, p_pix2 + 8, i_stride2, 16 ); return u32Sum; } int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ); return ( i32Sum + 2 ) >> 2; } int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ) { int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) + sa8d_8x8_msa( p_pix1 + 8, i_stride, p_pix2 + 8, i_stride2 ) + sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride, p_pix2 + 8 * i_stride2, i_stride2 ) + sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride, p_pix2 + 8 + 8 * i_stride2, i_stride2 ); return ( i32Sum + 2 ) >> 2; } void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_intra_predict_vert_4x4_msa( p_dec ); p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_hor_4x4_msa( p_dec ); p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_dc_4x4_msa( p_dec ); p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_intra_predict_vert_16x16_msa( p_dec ); p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_hor_16x16_msa( p_dec ); p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_dc_16x16_msa( p_dec ); p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ) { ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); x264_intra_predict_v_8x8_msa( pix, p_edge ); p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_h_8x8_msa( pix, p_edge ); p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_dc_8x8_msa( pix, p_edge ); p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, p_enc, FENC_STRIDE ); } void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ) { x264_intra_predict_dc_4blk_8x8_msa( p_dec ); p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_hor_8x8_msa( p_dec ); p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); x264_intra_predict_vert_8x8_msa( p_dec ); p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, p_enc, FENC_STRIDE ); } uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ) { return avc_pixel_var16width_msa( p_pix, i_stride, 16 ); } uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ) { return avc_pixel_var8width_msa( p_pix, i_stride, 16 ); } uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ) { return avc_pixel_var8width_msa( p_pix, i_stride, 8 ); } int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, uint8_t *p_pix2, intptr_t 
i_stride2, int32_t *p_ssd ) { int32_t i_var = 0, i_diff = 0, i_sqr = 0; i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16, &i_diff ); i_var = VARIANCE_WxH( i_sqr, i_diff, 7 ); *p_ssd = i_sqr; return i_var; } int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, uint8_t *p_pix2, intptr_t i_stride2, int32_t *p_ssd ) { int32_t i_var = 0, i_diff = 0, i_sqr = 0; i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 8, &i_diff ); i_var = VARIANCE_WxH( i_sqr, i_diff, 6 ); *p_ssd = i_sqr; return i_var; } #endif x264-master/common/mips/pixel.h000066400000000000000000000346411502133446700166210ustar00rootroot00000000000000/***************************************************************************** * pixel.h: msa pixel metrics ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Mandar Sahastrabuddhe * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_MIPS_PIXEL_H #define X264_MIPS_PIXEL_H #define x264_pixel_sad_16x16_msa x264_template(pixel_sad_16x16_msa) int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_16x8_msa x264_template(pixel_sad_16x8_msa) int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x16_msa x264_template(pixel_sad_8x16_msa) int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x8_msa x264_template(pixel_sad_8x8_msa) int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_8x4_msa x264_template(pixel_sad_8x4_msa) int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_4x16_msa x264_template(pixel_sad_4x16_msa) int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_4x8_msa x264_template(pixel_sad_4x8_msa) int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_4x4_msa x264_template(pixel_sad_4x4_msa) int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_sad_x4_16x16_msa x264_template(pixel_sad_x4_16x16_msa) void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] 
); #define x264_pixel_sad_x4_16x8_msa x264_template(pixel_sad_x4_16x8_msa) void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x16_msa x264_template(pixel_sad_x4_8x16_msa) void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x8_msa x264_template(pixel_sad_x4_8x8_msa) void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_8x4_msa x264_template(pixel_sad_x4_8x4_msa) void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_4x8_msa x264_template(pixel_sad_x4_4x8_msa) void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x4_4x4_msa x264_template(pixel_sad_x4_4x4_msa) void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, uint8_t *p_ref3, intptr_t i_ref_stride, int32_t p_sad_array[4] ); #define x264_pixel_sad_x3_16x16_msa x264_template(pixel_sad_x3_16x16_msa) void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_16x8_msa x264_template(pixel_sad_x3_16x8_msa) void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_8x16_msa x264_template(pixel_sad_x3_8x16_msa) void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_8x8_msa x264_template(pixel_sad_x3_8x8_msa) void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_8x4_msa x264_template(pixel_sad_x3_8x4_msa) void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_4x8_msa x264_template(pixel_sad_x3_4x8_msa) void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_sad_x3_4x4_msa x264_template(pixel_sad_x3_4x4_msa) void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, uint8_t *p_ref1, uint8_t *p_ref2, intptr_t i_ref_stride, int32_t p_sad_array[3] ); #define x264_pixel_ssd_16x16_msa x264_template(pixel_ssd_16x16_msa) int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_16x8_msa x264_template(pixel_ssd_16x8_msa) int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x16_msa x264_template(pixel_ssd_8x16_msa) int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x8_msa 
x264_template(pixel_ssd_8x8_msa) int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_8x4_msa x264_template(pixel_ssd_8x4_msa) int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_4x16_msa x264_template(pixel_ssd_4x16_msa) int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_4x8_msa x264_template(pixel_ssd_4x8_msa) int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_pixel_ssd_4x4_msa x264_template(pixel_ssd_4x4_msa) int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, uint8_t *p_ref, intptr_t i_ref_stride ); #define x264_intra_sad_x3_4x4_msa x264_template(intra_sad_x3_4x4_msa) void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_sad_x3_16x16_msa x264_template(intra_sad_x3_16x16_msa) void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_sad_x3_8x8_msa x264_template(intra_sad_x3_8x8_msa) void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ); #define x264_intra_sad_x3_8x8c_msa x264_template(intra_sad_x3_8x8c_msa) void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_ssim_4x4x2_core_msa x264_template(ssim_4x4x2_core_msa) void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, const uint8_t *p_pix2, intptr_t i_stride2, int32_t i_sums[2][4] ); #define x264_pixel_hadamard_ac_8x8_msa x264_template(pixel_hadamard_ac_8x8_msa) uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_8x16_msa x264_template(pixel_hadamard_ac_8x16_msa) uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_16x8_msa x264_template(pixel_hadamard_ac_16x8_msa) uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_hadamard_ac_16x16_msa x264_template(pixel_hadamard_ac_16x16_msa) uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_satd_4x4_msa x264_template(pixel_satd_4x4_msa) int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_4x8_msa x264_template(pixel_satd_4x8_msa) int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_4x16_msa x264_template(pixel_satd_4x16_msa) int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x4_msa x264_template(pixel_satd_8x4_msa) int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x8_msa x264_template(pixel_satd_8x8_msa) int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_8x16_msa x264_template(pixel_satd_8x16_msa) int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_16x8_msa x264_template(pixel_satd_16x8_msa) int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t 
*p_pix2, intptr_t i_stride2 ); #define x264_pixel_satd_16x16_msa x264_template(pixel_satd_16x16_msa) int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_sa8d_8x8_msa x264_template(pixel_sa8d_8x8_msa) int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_pixel_sa8d_16x16_msa x264_template(pixel_sa8d_16x16_msa) int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, uint8_t *p_pix2, intptr_t i_stride2 ); #define x264_intra_satd_x3_4x4_msa x264_template(intra_satd_x3_4x4_msa) void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_satd_x3_16x16_msa x264_template(intra_satd_x3_16x16_msa) void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_intra_sa8d_x3_8x8_msa x264_template(intra_sa8d_x3_8x8_msa) void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], int32_t p_sad_array[3] ); #define x264_intra_satd_x3_8x8c_msa x264_template(intra_satd_x3_8x8c_msa) void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, int32_t p_sad_array[3] ); #define x264_pixel_var_16x16_msa x264_template(pixel_var_16x16_msa) uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_var_8x16_msa x264_template(pixel_var_8x16_msa) uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_var_8x8_msa x264_template(pixel_var_8x8_msa) uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); #define x264_pixel_var2_8x16_msa x264_template(pixel_var2_8x16_msa) int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, uint8_t *p_pix2, intptr_t i_stride2, int32_t *p_ssd ); #define x264_pixel_var2_8x8_msa x264_template(pixel_var2_8x8_msa) int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, uint8_t *p_pix2, intptr_t i_stride2, int32_t *p_ssd ); #endif x264-master/common/mips/predict-c.c000066400000000000000000000477511502133446700173530ustar00rootroot00000000000000/***************************************************************************** * predict-c.c: msa intra prediction ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Mandar Sahastrabuddhe * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macros.h" #include "predict.h" #if !HIGH_BIT_DEPTH static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst, int32_t i_dst_stride ) { uint32_t u_src_data; u_src_data = LW( p_src ); SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride ); } static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst, int32_t i_dst_stride ) { uint64_t u_out; u_out = LD( p_src ); SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); } static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst, int32_t i_dst_stride ) { v16u8 src0 = LD_UB( p_src ); ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, i_dst_stride ); p_dst += ( 8 * i_dst_stride ); ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, i_dst_stride ); } static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride ) { uint32_t u_out0, u_out1, u_out2, u_out3; u_out0 = p_src[0 * i_src_stride] * 0x01010101; u_out1 = p_src[1 * i_src_stride] * 0x01010101; u_out2 = p_src[2 * i_src_stride] * 0x01010101; u_out3 = p_src[3 * i_src_stride] * 0x01010101; SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); } static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride ) { uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull; u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull; u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull; u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull; u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull; u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull; u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull; u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull; SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); } static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_dst, int32_t i_dst_stride ) { uint32_t u_row; uint8_t u_inp0, u_inp1, u_inp2, u_inp3; v16u8 src0, src1, src2, src3; for( u_row = 4; u_row--; ) { u_inp0 = p_src[0]; p_src += i_src_stride; u_inp1 = p_src[0]; p_src += i_src_stride; u_inp2 = p_src[0]; p_src += i_src_stride; u_inp3 = p_src[0]; p_src += i_src_stride; src0 = ( v16u8 ) __msa_fill_b( u_inp0 ); src1 = ( v16u8 ) __msa_fill_b( u_inp1 ); src2 = ( v16u8 ) __msa_fill_b( u_inp2 ); src3 = ( v16u8 ) __msa_fill_b( u_inp3 ); ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); } } static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left, int32_t i_src_stride_left, uint8_t *p_dst, int32_t i_dst_stride, uint8_t is_above, uint8_t is_left ) { uint32_t u_row; uint32_t u_out, u_addition = 0; v16u8 src_above, store; v8u16 sum_above; v4u32 sum; if( is_left && is_above ) { src_above = LD_UB( p_src_top ); sum_above = __msa_hadd_u_h( src_above, src_above ); sum = __msa_hadd_u_w( sum_above, sum_above ); u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); for( u_row = 0; u_row < 4; u_row++ ) { u_addition += p_src_left[u_row * i_src_stride_left]; } u_addition = ( u_addition + 4 ) >> 3; store = ( v16u8 ) __msa_fill_b( u_addition ); } else if( is_left ) 
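/* Descriptive note (added): DC prediction with only the left neighbours
 * available. The four left pixels are accumulated with scalar loads,
 * rounded with ( sum + 2 ) >> 2 and broadcast to every lane, mirroring
 * the scalar DC-left predictor in common/predict.c. */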
{ for( u_row = 0; u_row < 4; u_row++ ) { u_addition += p_src_left[u_row * i_src_stride_left]; } u_addition = ( u_addition + 2 ) >> 2; store = ( v16u8 ) __msa_fill_b( u_addition ); } else if( is_above ) { src_above = LD_UB( p_src_top ); sum_above = __msa_hadd_u_h( src_above, src_above ); sum = __msa_hadd_u_w( sum_above, sum_above ); sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 ); store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); } else { store = ( v16u8 ) __msa_ldi_b( 128 ); } u_out = __msa_copy_u_w( ( v4i32 ) store, 0 ); SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); } static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left, uint8_t *p_dst, int32_t i_dst_stride ) { uint64_t u_val0, u_val1; v16i8 store; v16u8 src = { 0 }; v8u16 sum_h; v4u32 sum_w; v2u64 sum_d; u_val0 = LD( p_src_top ); u_val1 = LD( p_src_left ); INSERT_D2_UB( u_val0, u_val1, src ); sum_h = __msa_hadd_u_h( src, src ); sum_w = __msa_hadd_u_w( sum_h, sum_h ); sum_d = __msa_hadd_u_d( sum_w, sum_w ); sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d ); sum_d = __msa_hadd_u_d( sum_w, sum_w ); sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 ); store = __msa_splati_b( ( v16i8 ) sum_w, 0 ); u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 ); SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride ); } static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left, int32_t i_src_stride_left, uint8_t *p_dst, int32_t i_dst_stride, uint8_t is_above, uint8_t is_left ) { uint32_t u_row; uint32_t u_addition = 0; v16u8 src_above, store; v8u16 sum_above; v4u32 sum_top; v2u64 sum; if( is_left && is_above ) { src_above = LD_UB( p_src_top ); sum_above = __msa_hadd_u_h( src_above, src_above ); sum_top = __msa_hadd_u_w( sum_above, sum_above ); sum = __msa_hadd_u_d( sum_top, sum_top ); sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum ); sum = __msa_hadd_u_d( sum_top, sum_top ); u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); for( u_row = 0; u_row < 16; u_row++ ) { u_addition += p_src_left[u_row * i_src_stride_left]; } u_addition = ( u_addition + 16 ) >> 5; store = ( v16u8 ) __msa_fill_b( u_addition ); } else if( is_left ) { for( u_row = 0; u_row < 16; u_row++ ) { u_addition += p_src_left[u_row * i_src_stride_left]; } u_addition = ( u_addition + 8 ) >> 4; store = ( v16u8 ) __msa_fill_b( u_addition ); } else if( is_above ) { src_above = LD_UB( p_src_top ); sum_above = __msa_hadd_u_h( src_above, src_above ); sum_top = __msa_hadd_u_w( sum_above, sum_above ); sum = __msa_hadd_u_d( sum_top, sum_top ); sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum ); sum = __msa_hadd_u_d( sum_top, sum_top ); sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 ); store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); } else { store = ( v16u8 ) __msa_ldi_b( 128 ); } ST_UB8( store, store, store, store, store, store, store, store, p_dst, i_dst_stride ); p_dst += ( 8 * i_dst_stride ); ST_UB8( store, store, store, store, store, store, store, store, p_dst, i_dst_stride ); } static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride ) { uint8_t u_lpcnt; int32_t i_res, i_res0, i_res1, i_res2, i_res3; uint64_t u_out0, u_out1; v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 }; v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 }; v4i32 int_multiplier = { 0, 1, 2, 3 }; v16u8 p_src_top; v8i16 vec9, vec10, vec11; v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, 
vec8; v2i64 sum; p_src_top = LD_UB( p_src - ( i_stride + 1 ) ); p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top, ( v16i8 ) p_src_top ); vec9 = __msa_hsub_u_h( p_src_top, p_src_top ); vec9 *= short_multiplier; vec8 = __msa_hadd_s_w( vec9, vec9 ); sum = __msa_hadd_s_d( vec8, vec8 ); i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 ); i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) + 2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) + 3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) + 4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] ); i_res0 *= 17; i_res1 *= 17; i_res0 = ( i_res0 + 16 ) >> 5; i_res1 = ( i_res1 + 16 ) >> 5; i_res3 = 3 * ( i_res0 + i_res1 ); i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 ); i_res = i_res2 - i_res3; vec8 = __msa_fill_w( i_res0 ); vec4 = __msa_fill_w( i_res ); vec2 = __msa_fill_w( i_res1 ); vec5 = vec8 * int_multiplier; vec3 = vec8 * 4; for( u_lpcnt = 4; u_lpcnt--; ) { vec0 = vec5; vec0 += vec4; vec1 = vec0 + vec3; vec6 = vec5; vec4 += vec2; vec6 += vec4; vec7 = vec6 + vec3; SRA_4V( vec0, vec1, vec6, vec7, 5 ); PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 ); CLIP_SH2_0_255( vec10, vec11 ); PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 ); u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 ); u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 ); SD( u_out0, p_src ); p_src += i_stride; SD( u_out1, p_src ); p_src += i_stride; vec4 += vec2; } } static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride ) { uint8_t u_lpcnt; int32_t i_res0, i_res1, i_res2, i_res3; uint64_t u_load0, u_load1; v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 }; v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 }; v4i32 int_multiplier = { 0, 1, 2, 3 }; v16u8 p_src_top = { 0 }; v8i16 vec9, vec10; v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add; u_load0 = LD( p_src - ( i_stride + 1 ) ); u_load1 = LD( p_src - ( i_stride + 1 ) + 9 ); INSERT_D2_UB( u_load0, u_load1, p_src_top ); p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top, ( v16i8 ) p_src_top ); vec9 = __msa_hsub_u_h( p_src_top, p_src_top ); vec9 *= short_multiplier; vec8 = __msa_hadd_s_w( vec9, vec9 ); res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 ); i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 ); i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) + 2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) + 3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) + 4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) + 5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) + 6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) + 7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) + 8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] ); i_res0 *= 5; i_res1 *= 5; i_res0 = ( i_res0 + 32 ) >> 6; i_res1 = ( i_res1 + 32 ) >> 6; i_res3 = 7 * ( i_res0 + i_res1 ); i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 ); i_res2 -= i_res3; vec8 = __msa_fill_w( i_res0 ); vec4 = __msa_fill_w( i_res2 ); vec5 = __msa_fill_w( i_res1 ); vec6 = vec8 * 4; vec7 = vec8 * int_multiplier; for( u_lpcnt = 16; u_lpcnt--; ) { vec0 = vec7; vec0 += vec4; vec1 = vec0 + vec6; vec2 = vec1 + vec6; vec3 = vec2 + vec6; SRA_4V( vec0, vec1, vec2, vec3, 5 ); PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 ); CLIP_SH2_0_255( vec9, vec10 ); PCKEV_ST_SB( vec9, vec10, p_src ); p_src += i_stride; vec4 += vec5; } } static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride ) { 
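/* Descriptive note (added): DC prediction for the four 4x4 sub-blocks of an
 * 8x8 chroma block, following the H.264 chroma DC rules: the top-left block
 * averages its top and left neighbours ( ( sum + 4 ) >> 3 ), the top-right
 * block uses only its top neighbours and the bottom-left block only its left
 * neighbours ( ( sum + 2 ) >> 2 each ), while the bottom-right block uses its
 * top and left neighbours again. The four results are broadcast and stored
 * with the 4x4 offsets computed at the end of the loop below. */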
uint8_t u_lp_cnt; uint32_t u_src0, u_src1, u_src3, u_src2 = 0; uint32_t u_out0, u_out1, u_out2, u_out3; v16u8 p_src_top; v8u16 add; v4u32 sum; p_src_top = LD_UB( p_src - i_stride ); add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top ); sum = __msa_hadd_u_w( add, add ); u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 ); u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 ); for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ ) { u_src0 += p_src[u_lp_cnt * i_stride - 1]; u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1]; } u_src0 = ( u_src0 + 4 ) >> 3; u_src3 = ( u_src1 + u_src2 + 4 ) >> 3; u_src1 = ( u_src1 + 2 ) >> 2; u_src2 = ( u_src2 + 2 ) >> 2; u_out0 = u_src0 * 0x01010101; u_out1 = u_src1 * 0x01010101; u_out2 = u_src2 * 0x01010101; u_out3 = u_src3 * 0x01010101; for( u_lp_cnt = 4; u_lp_cnt--; ) { SW( u_out0, p_src ); SW( u_out1, ( p_src + 4 ) ); SW( u_out2, ( p_src + 4 * i_stride ) ); SW( u_out3, ( p_src + 4 * i_stride + 4 ) ); p_src += i_stride; } } static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst, int32_t i_dst_stride ) { uint8_t u_src_val = p_src[15]; uint64_t u_out0, u_out1, u_out2, u_out3; v16u8 src, vec4, vec5, res0; v8u16 vec0, vec1, vec2, vec3; v2i64 res1, res2, res3; src = LD_UB( p_src ); vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 ); vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 ); vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val ); ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 ); ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 ); HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 ); vec0 += vec1; vec2 += vec3; vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 ); vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 ); res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 ); res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 ); res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 ); res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 ); u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 ); u_out1 = __msa_copy_u_d( res1, 0 ); u_out2 = __msa_copy_u_d( res2, 0 ); u_out3 = __msa_copy_u_d( res3, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 ); res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 ); res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 ); res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 ); u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 ); u_out1 = __msa_copy_u_d( res1, 0 ); u_out2 = __msa_copy_u_d( res2, 0 ); u_out3 = __msa_copy_u_d( res3, 0 ); SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); } static void intra_predict_128dc_16x16_msa( uint8_t *p_dst, int32_t i_dst_stride ) { v16u8 out = ( v16u8 ) __msa_ldi_b( 128 ); ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride ); p_dst += ( 8 * i_dst_stride ); ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride ); } void x264_intra_predict_dc_16x16_msa( uint8_t *p_src ) { intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 ); } void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src ) { intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 ); } void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src ) { intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 ); } void 
x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src ) { intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE ); } void x264_intra_predict_hor_16x16_msa( uint8_t *p_src ) { intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE ); } void x264_intra_predict_vert_16x16_msa( uint8_t *p_src ) { intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); } void x264_intra_predict_plane_16x16_msa( uint8_t *p_src ) { intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE ); } void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src ) { intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE ); } void x264_intra_predict_hor_8x8_msa( uint8_t *p_src ) { intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE ); } void x264_intra_predict_vert_8x8_msa( uint8_t *p_src ) { intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); } void x264_intra_predict_plane_8x8_msa( uint8_t *p_src ) { intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE ); } void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) { intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE ); } void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) { intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ), p_src, FDEC_STRIDE ); } void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) { intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE ); } void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) { intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE ); } void x264_intra_predict_dc_4x4_msa( uint8_t *p_src ) { intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 ); } void x264_intra_predict_hor_4x4_msa( uint8_t *p_src ) { intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE, p_src, FDEC_STRIDE ); } void x264_intra_predict_vert_4x4_msa( uint8_t *p_src ) { intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); } #endif x264-master/common/mips/predict.h000066400000000000000000000074541502133446700171340ustar00rootroot00000000000000/***************************************************************************** * predict.h: msa intra prediction ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Rishikesh More * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_MIPS_PREDICT_H #define X264_MIPS_PREDICT_H #define x264_intra_predict_dc_16x16_msa x264_template(intra_predict_dc_16x16_msa) void x264_intra_predict_dc_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_dc_left_16x16_msa x264_template(intra_predict_dc_left_16x16_msa) void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_dc_top_16x16_msa x264_template(intra_predict_dc_top_16x16_msa) void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_dc_128_16x16_msa x264_template(intra_predict_dc_128_16x16_msa) void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_hor_16x16_msa x264_template(intra_predict_hor_16x16_msa) void x264_intra_predict_hor_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_vert_16x16_msa x264_template(intra_predict_vert_16x16_msa) void x264_intra_predict_vert_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_plane_16x16_msa x264_template(intra_predict_plane_16x16_msa) void x264_intra_predict_plane_16x16_msa( uint8_t *p_src ); #define x264_intra_predict_dc_4blk_8x8_msa x264_template(intra_predict_dc_4blk_8x8_msa) void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src ); #define x264_intra_predict_hor_8x8_msa x264_template(intra_predict_hor_8x8_msa) void x264_intra_predict_hor_8x8_msa( uint8_t *p_src ); #define x264_intra_predict_vert_8x8_msa x264_template(intra_predict_vert_8x8_msa) void x264_intra_predict_vert_8x8_msa( uint8_t *p_src ); #define x264_intra_predict_plane_8x8_msa x264_template(intra_predict_plane_8x8_msa) void x264_intra_predict_plane_8x8_msa( uint8_t *p_src ); #define x264_intra_predict_ddl_8x8_msa x264_template(intra_predict_ddl_8x8_msa) void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); #define x264_intra_predict_dc_8x8_msa x264_template(intra_predict_dc_8x8_msa) void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); #define x264_intra_predict_h_8x8_msa x264_template(intra_predict_h_8x8_msa) void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); #define x264_intra_predict_v_8x8_msa x264_template(intra_predict_v_8x8_msa) void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); #define x264_intra_predict_dc_4x4_msa x264_template(intra_predict_dc_4x4_msa) void x264_intra_predict_dc_4x4_msa( uint8_t *p_src ); #define x264_intra_predict_hor_4x4_msa x264_template(intra_predict_hor_4x4_msa) void x264_intra_predict_hor_4x4_msa( uint8_t *p_src ); #define x264_intra_predict_vert_4x4_msa x264_template(intra_predict_vert_4x4_msa) void x264_intra_predict_vert_4x4_msa( uint8_t *p_src ); #endif x264-master/common/mips/quant-c.c000066400000000000000000000563621502133446700170470ustar00rootroot00000000000000/***************************************************************************** * quant-c.c: msa quantization and level-run ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Rishikesh More * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "macros.h" #include "quant.h" #if !HIGH_BIT_DEPTH static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], int32_t i_qp ) { const int32_t i_mf = i_qp % 6; const int32_t q_bits = i_qp / 6 - 4; v8i16 dct0, dct1; v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; LD_SH2( p_dct, 8, dct0, dct1 ); LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); if( q_bits >= 0 ) { v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec; q_bits_vec = __msa_fill_h( q_bits ); PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, dequant_mf_h0, dequant_mf_h1 ); dct0 *= dequant_mf_h0; dct1 *= dequant_mf_h1; dct0 <<= q_bits_vec; dct1 <<= q_bits_vec; ST_SH2( dct0, dct1, p_dct, 8 ); } else { const int32_t q_bits_add = 1 << ( -q_bits - 1 ); v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 q_bits_vec, q_bits_vec_add; q_bits_vec_add = __msa_fill_w( q_bits_add ); q_bits_vec = __msa_fill_w( -q_bits ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); dct_signed_w0 *= dequant_m_f0; dct_signed_w1 *= dequant_m_f1; dct_signed_w2 *= dequant_m_f2; dct_signed_w3 *= dequant_m_f3; dct_signed_w0 += q_bits_vec_add; dct_signed_w1 += q_bits_vec_add; dct_signed_w2 += q_bits_vec_add; dct_signed_w3 += q_bits_vec_add; SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, q_bits_vec ); PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, dct0, dct1 ); ST_SH2( dct0, dct1, p_dct, 8 ); } } static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], int32_t i_qp ) { const int32_t i_mf = i_qp % 6; const int32_t q_bits = i_qp / 6 - 6; v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7; v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7; v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11; v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15; LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 ); LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 ); LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 ); LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 ); LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 ); LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 ); LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 ); if( q_bits >= 0 ) { v8i16 q_bits_vec; v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3; v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7; q_bits_vec = __msa_fill_h( q_bits ); PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6, dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, 
dequant_mf_h3 ); PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10, dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14, dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7 ); dct0 *= dequant_mf_h0; dct1 *= dequant_mf_h1; dct2 *= dequant_mf_h2; dct3 *= dequant_mf_h3; dct4 *= dequant_mf_h4; dct5 *= dequant_mf_h5; dct6 *= dequant_mf_h6; dct7 *= dequant_mf_h7; SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec ); SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec ); ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); } else { const int32_t q_bits_add = 1 << ( -q_bits - 1 ); v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11; v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15; v4i32 q_bits_vec, q_bits_vec_add; q_bits_vec_add = __msa_fill_w( q_bits_add ); q_bits_vec = __msa_fill_w( -q_bits ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 ); UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 ); UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 ); UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 ); dct_signed_w0 *= dequant_m_f0; dct_signed_w1 *= dequant_m_f1; dct_signed_w2 *= dequant_m_f2; dct_signed_w3 *= dequant_m_f3; dct_signed_w4 *= dequant_m_f4; dct_signed_w5 *= dequant_m_f5; dct_signed_w6 *= dequant_m_f6; dct_signed_w7 *= dequant_m_f7; dct_signed_w8 *= dequant_m_f8; dct_signed_w9 *= dequant_m_f9; dct_signed_w10 *= dequant_m_f10; dct_signed_w11 *= dequant_m_f11; dct_signed_w12 *= dequant_m_f12; dct_signed_w13 *= dequant_m_f13; dct_signed_w14 *= dequant_m_f14; dct_signed_w15 *= dequant_m_f15; dct_signed_w0 += q_bits_vec_add; dct_signed_w1 += q_bits_vec_add; dct_signed_w2 += q_bits_vec_add; dct_signed_w3 += q_bits_vec_add; dct_signed_w4 += q_bits_vec_add; dct_signed_w5 += q_bits_vec_add; dct_signed_w6 += q_bits_vec_add; dct_signed_w7 += q_bits_vec_add; dct_signed_w8 += q_bits_vec_add; dct_signed_w9 += q_bits_vec_add; dct_signed_w10 += q_bits_vec_add; dct_signed_w11 += q_bits_vec_add; dct_signed_w12 += q_bits_vec_add; dct_signed_w13 += q_bits_vec_add; dct_signed_w14 += q_bits_vec_add; dct_signed_w15 += q_bits_vec_add; SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, q_bits_vec ); SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7, q_bits_vec ); SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11, q_bits_vec ); SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15, q_bits_vec ); PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6, dct0, dct1, dct2, dct3 ); PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11, dct_signed_w10, dct_signed_w13, dct_signed_w12, dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 ); ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); } } static void avc_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], int32_t i_qp ) { const int32_t q_bits = i_qp / 6 - 6; int32_t i_dmf = pi_dequant_mf[i_qp % 6][0]; v8i16 dct0, dct1, dequant_mf_h; LD_SH2( p_dct, 8, dct0, dct1 ); if( q_bits >= 0 ) { i_dmf <<= q_bits; dequant_mf_h = __msa_fill_h( i_dmf ); dct0 = dct0 * dequant_mf_h; dct1 = dct1 * 
dequant_mf_h; ST_SH2( dct0, dct1, p_dct, 8 ); } else { const int32_t q_bits_add = 1 << ( -q_bits - 1 ); v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; q_bits_vec_add = __msa_fill_w( q_bits_add ); q_bits_vec = __msa_fill_w( -q_bits ); dequant_m_f = __msa_fill_w( i_dmf ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); dct_signed_w0 *= dequant_m_f; dct_signed_w1 *= dequant_m_f; dct_signed_w2 *= dequant_m_f; dct_signed_w3 *= dequant_m_f; dct_signed_w0 += q_bits_vec_add; dct_signed_w1 += q_bits_vec_add; dct_signed_w2 += q_bits_vec_add; dct_signed_w3 += q_bits_vec_add; SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, q_bits_vec ); PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, dct0, dct1 ); ST_SH2( dct0, dct1, p_dct, 8 ); } } static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask; v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 bias0, bias1, bias2, bias3; LD_SH2( p_dct, 8, dct0, dct1 ); LD_SH2( p_bias, 8, bias_h0, bias_h1 ); LD_SH2( p_mf, 8, mf_h0, mf_h1 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 ); ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 ); ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 ); ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); dct0 = zero - dct_h0; dct1 = zero - dct_h1; dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); ST_SH2( dct0, dct1, p_dct, 8 ); return !!non_zero; } static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct2, dct3; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask; v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3; v8i16 bias_h0, bias_h1, bias_h2, bias_h3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7; v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7; LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); 
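/* Descriptive note (added): the rows are widened to 32-bit lanes so the core
 * quant step, ( |coef| + bias ) * mf >> 16 with the sign restored afterwards
 * from the clei_s_h masks, keeps full precision; the second half of the 8x8
 * block repeats the same sequence further below. */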
UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, dct_h0, dct_h1, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 ); LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 ); SUB4( 
zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 ); return !!non_zero; } static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct0_mask, dct1_mask; v8i16 zero = { 0 }; v8i16 dct_h0, dct_h1; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3; v4i32 mf_vec, bias_vec; LD_SH2( p_dct, 8, dct0, dct1 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); bias_vec = __msa_fill_w( i_bias ); mf_vec = __msa_fill_w( i_mf ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec ); dct_w0 *= mf_vec; dct_w1 *= mf_vec; dct_w2 *= mf_vec; dct_w3 *= mf_vec; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); dct0 = zero - dct_h0; dct1 = zero - dct_h1; dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); ST_SH2( dct0, dct1, p_dct, 8 ); return !!non_zero; } static int32_t avc_coeff_last64_msa( int16_t *p_src ) { uint32_t u_res; v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7; v16u8 tmp0, tmp1, tmp2, tmp3; v8u16 vec0, vec1, vec2, vec3; v4i32 out0; v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); tmp_h0 = __msa_ceqi_h( src0, 0 ); tmp_h1 = __msa_ceqi_h( src1, 0 ); tmp_h2 = __msa_ceqi_h( src2, 0 ); tmp_h3 = __msa_ceqi_h( src3, 0 ); tmp_h4 = __msa_ceqi_h( src4, 0 ); tmp_h5 = __msa_ceqi_h( src5, 0 ); tmp_h6 = __msa_ceqi_h( src6, 0 ); tmp_h7 = __msa_ceqi_h( src7, 0 ); PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6, tmp0, tmp1, tmp2, tmp3 ); tmp0 = tmp0 & mask; tmp1 = tmp1 & mask; tmp2 = tmp2 & mask; tmp3 = tmp3 & mask; HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 ); PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 ); HADD_UB2_UH( tmp0, tmp1, vec0, vec1 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 ); vec0 = __msa_hadd_u_h( tmp0, tmp0 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 ); out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 ); u_res = __msa_copy_u_w( out0, 0 ); return ( 63 - u_res ); } static int32_t avc_coeff_last16_msa( int16_t *p_src ) { uint32_t u_res; v8i16 src0, src1; v8u16 tmp_h0; v16u8 tmp0; v8i16 out0, out1; v16i8 res0; v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; LD_SH2( p_src, 8, src0, src1 ); out0 = __msa_ceqi_h( src0, 0 ); out1 = __msa_ceqi_h( src1, 0 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( 
v16i8 ) out0 ); tmp0 = tmp0 & mask; tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); out0 = __msa_nloc_h( ( v8i16 ) res0 ); u_res = __msa_copy_u_h( out0, 0 ); return ( 15 - u_res ); } void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], int32_t i_qp ) { avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp ); } void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], int32_t i_qp ) { avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp ); } void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], int32_t i_qp ) { avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp ); } int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { return avc_quant_4x4_msa( p_dct, p_mf, p_bias ); } int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16], uint16_t pu_mf[16], uint16_t pu_bias[16] ) { int32_t i_non_zero, i_non_zero_acc = 0; for( int32_t j = 0; j < 4; j++ ) { i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias ); i_non_zero_acc |= ( !!i_non_zero ) << j; } return i_non_zero_acc; } int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { return avc_quant_8x8_msa( p_dct, p_mf, p_bias ); } int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ) { return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias ); } int32_t x264_coeff_last64_msa( int16_t *p_src ) { return avc_coeff_last64_msa( p_src ); } int32_t x264_coeff_last16_msa( int16_t *p_src ) { return avc_coeff_last16_msa( p_src ); } #endif x264-master/common/mips/quant.h000066400000000000000000000051371502133446700166260ustar00rootroot00000000000000/***************************************************************************** * quant.h: msa quantization and level-run ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Rishikesh More * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_MIPS_QUANT_H #define X264_MIPS_QUANT_H #define x264_dequant_4x4_msa x264_template(dequant_4x4_msa) void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], int32_t i_qp ); #define x264_dequant_8x8_msa x264_template(dequant_8x8_msa) void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], int32_t i_qp ); #define x264_dequant_4x4_dc_msa x264_template(dequant_4x4_dc_msa) void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], int32_t i_qp ); #define x264_quant_4x4_msa x264_template(quant_4x4_msa) int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ); #define x264_quant_4x4x4_msa x264_template(quant_4x4x4_msa) int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16], uint16_t pu_mf[16], uint16_t pu_bias[16] ); #define x264_quant_8x8_msa x264_template(quant_8x8_msa) int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ); #define x264_quant_4x4_dc_msa x264_template(quant_4x4_dc_msa) int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ); #define x264_coeff_last64_msa x264_template(coeff_last64_msa) int32_t x264_coeff_last64_msa( int16_t *p_src ); #define x264_coeff_last16_msa x264_template(coeff_last16_msa) int32_t x264_coeff_last16_msa( int16_t *p_src ); #endif x264-master/common/mvpred.c000066400000000000000000000542731502133446700160230ustar00rootroot00000000000000/***************************************************************************** * mvpred.c: motion vector prediction ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] ) { const int i8 = x264_scan8[idx]; const int i_ref= h->mb.cache.ref[i_list][i8]; int i_refa = h->mb.cache.ref[i_list][i8 - 1]; int16_t *mv_a = h->mb.cache.mv[i_list][i8 - 1]; int i_refb = h->mb.cache.ref[i_list][i8 - 8]; int16_t *mv_b = h->mb.cache.mv[i_list][i8 - 8]; int i_refc = h->mb.cache.ref[i_list][i8 - 8 + i_width]; int16_t *mv_c = h->mb.cache.mv[i_list][i8 - 8 + i_width]; // Partitions not yet reached in scan order are unavailable. 
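/* Descriptive note (added): standard H.264 median prediction. mvp is the
 * component-wise median of the left (A), top (B) and top-right (C) neighbour
 * vectors; when C is unavailable, the top-left neighbour is substituted,
 * which is what the block below selects (including the MBAFF top-right
 * special cases). */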
if( (idx&3) >= 2 + (i_width&1) || i_refc == -2 ) { i_refc = h->mb.cache.ref[i_list][i8 - 8 - 1]; mv_c = h->mb.cache.mv[i_list][i8 - 8 - 1]; if( SLICE_MBAFF && h->mb.cache.ref[i_list][x264_scan8[0]-1] != -2 && MB_INTERLACED != h->mb.field[h->mb.i_mb_left_xy[0]] ) { if( idx == 2 ) { mv_c = h->mb.cache.topright_mv[i_list][0]; i_refc = h->mb.cache.topright_ref[i_list][0]; } else if( idx == 8 ) { mv_c = h->mb.cache.topright_mv[i_list][1]; i_refc = h->mb.cache.topright_ref[i_list][1]; } else if( idx == 10 ) { mv_c = h->mb.cache.topright_mv[i_list][2]; i_refc = h->mb.cache.topright_ref[i_list][2]; } } } if( h->mb.i_partition == D_16x8 ) { if( idx == 0 ) { if( i_refb == i_ref ) { CP32( mvp, mv_b ); return; } } else { if( i_refa == i_ref ) { CP32( mvp, mv_a ); return; } } } else if( h->mb.i_partition == D_8x16 ) { if( idx == 0 ) { if( i_refa == i_ref ) { CP32( mvp, mv_a ); return; } } else { if( i_refc == i_ref ) { CP32( mvp, mv_c ); return; } } } int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref); if( i_count > 1 ) { median: x264_median_mv( mvp, mv_a, mv_b, mv_c ); } else if( i_count == 1 ) { if( i_refa == i_ref ) CP32( mvp, mv_a ); else if( i_refb == i_ref ) CP32( mvp, mv_b ); else CP32( mvp, mv_c ); } else if( i_refb == -2 && i_refc == -2 && i_refa != -2 ) CP32( mvp, mv_a ); else goto median; } void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] ) { int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1]; int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1]; int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8]; int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8]; int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4]; int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4]; if( i_refc == -2 ) { i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1]; mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1]; } int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref); if( i_count > 1 ) { median: x264_median_mv( mvp, mv_a, mv_b, mv_c ); } else if( i_count == 1 ) { if( i_refa == i_ref ) CP32( mvp, mv_a ); else if( i_refb == i_ref ) CP32( mvp, mv_b ); else CP32( mvp, mv_c ); } else if( i_refb == -2 && i_refc == -2 && i_refa != -2 ) CP32( mvp, mv_a ); else goto median; } void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] ) { int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1]; int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8]; int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1]; int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8]; if( i_refa == -2 || i_refb == -2 || !( (uint32_t)i_refa | M32( mv_a ) ) || !( (uint32_t)i_refb | M32( mv_b ) ) ) { M32( mv ) = 0; } else x264_mb_predict_mv_16x16( h, 0, 0, mv ); } static int mb_predict_mv_direct16x16_temporal( x264_t *h ) { int mb_x = h->mb.i_mb_x; int mb_y = h->mb.i_mb_y; int mb_xy = h->mb.i_mb_xy; int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] }; int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] }; int preshift = MB_INTERLACED; int postshift = MB_INTERLACED; int offset = 1; int yshift = 1; h->mb.i_partition = partition_col[0]; if( PARAM_INTERLACED && h->fref[1][0]->field[mb_xy] != MB_INTERLACED ) { if( MB_INTERLACED ) { mb_y = h->mb.i_mb_y&~1; mb_xy = mb_x + h->mb.i_mb_stride * mb_y; type_col[0] = h->fref[1][0]->mb_type[mb_xy]; type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride]; partition_col[0] = h->fref[1][0]->mb_partition[mb_xy]; partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + 
h->mb.i_mb_stride]; preshift = 0; yshift = 0; if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) && (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) && partition_col[0] != D_8x8 ) h->mb.i_partition = D_16x8; else h->mb.i_partition = D_8x8; } else { int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1]; int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc) >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc); mb_y = (h->mb.i_mb_y&~1) + col_parity; mb_xy = mb_x + h->mb.i_mb_stride * mb_y; type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy]; partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy]; preshift = 1; yshift = 2; h->mb.i_partition = partition_col[0]; } offset = 0; } int i_mb_4x4 = 16 * h->mb.i_mb_stride * mb_y + 4 * mb_x; int i_mb_8x8 = 4 * h->mb.i_mb_stride * mb_y + 2 * mb_x; x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 ); /* Don't do any checks other than the ones we have to, based * on the size of the colocated partitions. * Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */ int max_i8 = (D_16x16 - h->mb.i_partition) + 1; int step = (h->mb.i_partition == D_16x8) + 1; int width = 4 >> ((D_16x16 - h->mb.i_partition)&1); int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1); for( int i8 = 0; i8 < max_i8; i8 += step ) { int x8 = i8&1; int y8 = i8>>1; int ypart = (SLICE_MBAFF && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ? MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 : 3*y8; if( IS_INTRA( type_col[y8] ) ) { x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, 0 ); x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 ); x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 ); continue; } int i_part_8x8 = i_mb_8x8 + x8 + (ypart>>1) * h->mb.i_b8_stride; int i_ref1_ref = h->fref[1][0]->ref[0][i_part_8x8]; int i_ref = (map_col_to_list0(i_ref1_ref>>preshift) * (1 << postshift)) + (offset&i_ref1_ref&MB_INTERLACED); if( i_ref >= 0 ) { int dist_scale_factor = h->mb.dist_scale_factor[i_ref][0]; int16_t *mv_col = h->fref[1][0]->mv[0][i_mb_4x4 + 3*x8 + ypart * h->mb.i_b4_stride]; int16_t mv_y = (mv_col[1] * (1 << yshift)) / 2; int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8; int l0y = ( dist_scale_factor * mv_y + 128 ) >> 8; if( h->param.i_threads > 1 && (l0y > h->mb.mv_max_spel[1] || l0y-mv_y > h->mb.mv_max_spel[1]) ) return 0; x264_macroblock_cache_ref( h, 2*x8, 2*y8, width, height, 0, i_ref ); x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, pack16to32_mask(l0x, l0y) ); x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_y) ); } else { /* the collocated ref isn't in the current list0 */ /* FIXME: we might still be able to use direct_8x8 on some partitions */ /* FIXME: with B-pyramid + extensive ref list reordering * (not currently used), we would also have to check * l1mv1 like in spatial mode */ return 0; } } return 1; } static ALWAYS_INLINE int mb_predict_mv_direct16x16_spatial( x264_t *h, int b_interlaced ) { int8_t ref[2]; ALIGNED_ARRAY_8( int16_t, mv,[2],[2] ); for( int i_list = 0; i_list < 2; i_list++ ) { int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1]; int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1]; int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8]; int16_t *mv_b = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8]; int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4]; int16_t *mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 + 4]; if( i_refc == -2 ) { i_refc = 
h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1]; mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1]; } int i_ref = (int)X264_MIN3( (unsigned)i_refa, (unsigned)i_refb, (unsigned)i_refc ); if( i_ref < 0 ) { i_ref = -1; M32( mv[i_list] ) = 0; } else { /* Same as x264_mb_predict_mv_16x16, but simplified to eliminate cases * not relevant to spatial direct. */ int i_count = (i_refa == i_ref) + (i_refb == i_ref) + (i_refc == i_ref); if( i_count > 1 ) x264_median_mv( mv[i_list], mv_a, mv_b, mv_c ); else { if( i_refa == i_ref ) CP32( mv[i_list], mv_a ); else if( i_refb == i_ref ) CP32( mv[i_list], mv_b ); else CP32( mv[i_list], mv_c ); } } x264_macroblock_cache_ref( h, 0, 0, 4, 4, i_list, i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, i_list, mv[i_list] ); ref[i_list] = i_ref; } int mb_x = h->mb.i_mb_x; int mb_y = h->mb.i_mb_y; int mb_xy = h->mb.i_mb_xy; int type_col[2] = { h->fref[1][0]->mb_type[mb_xy], h->fref[1][0]->mb_type[mb_xy] }; int partition_col[2] = { h->fref[1][0]->mb_partition[mb_xy], h->fref[1][0]->mb_partition[mb_xy] }; h->mb.i_partition = partition_col[0]; if( b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED ) { if( MB_INTERLACED ) { mb_y = h->mb.i_mb_y&~1; mb_xy = mb_x + h->mb.i_mb_stride * mb_y; type_col[0] = h->fref[1][0]->mb_type[mb_xy]; type_col[1] = h->fref[1][0]->mb_type[mb_xy + h->mb.i_mb_stride]; partition_col[0] = h->fref[1][0]->mb_partition[mb_xy]; partition_col[1] = h->fref[1][0]->mb_partition[mb_xy + h->mb.i_mb_stride]; if( (IS_INTRA(type_col[0]) || partition_col[0] == D_16x16) && (IS_INTRA(type_col[1]) || partition_col[1] == D_16x16) && partition_col[0] != D_8x8 ) h->mb.i_partition = D_16x8; else h->mb.i_partition = D_8x8; } else { int cur_poc = h->fdec->i_poc + h->fdec->i_delta_poc[MB_INTERLACED&h->mb.i_mb_y&1]; int col_parity = abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[0] - cur_poc) >= abs(h->fref[1][0]->i_poc + h->fref[1][0]->i_delta_poc[1] - cur_poc); mb_y = (h->mb.i_mb_y&~1) + col_parity; mb_xy = mb_x + h->mb.i_mb_stride * mb_y; type_col[0] = type_col[1] = h->fref[1][0]->mb_type[mb_xy]; partition_col[0] = partition_col[1] = h->fref[1][0]->mb_partition[mb_xy]; h->mb.i_partition = partition_col[0]; } } int i_mb_4x4 = b_interlaced ? 4 * (h->mb.i_b4_stride*mb_y + mb_x) : h->mb.i_b4_xy; int i_mb_8x8 = b_interlaced ? 2 * (h->mb.i_b8_stride*mb_y + mb_x) : h->mb.i_b8_xy; int8_t *l1ref0 = &h->fref[1][0]->ref[0][i_mb_8x8]; int8_t *l1ref1 = &h->fref[1][0]->ref[1][i_mb_8x8]; int16_t (*l1mv[2])[2] = { (int16_t (*)[2]) &h->fref[1][0]->mv[0][i_mb_4x4], (int16_t (*)[2]) &h->fref[1][0]->mv[1][i_mb_4x4] }; if( (M16( ref ) & 0x8080) == 0x8080 ) /* if( ref[0] < 0 && ref[1] < 0 ) */ { x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 ); x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 ); return 1; } if( h->param.i_threads > 1 && ( mv[0][1] > h->mb.mv_max_spel[1] || mv[1][1] > h->mb.mv_max_spel[1] ) ) { #if 0 fprintf(stderr, "direct_spatial: (%d,%d) (%d,%d) > %d \n", mv[0][0], mv[0][1], mv[1][0], mv[1][1], h->mb.mv_max_spel[1]); #endif return 0; } if( !M64( mv ) || (!b_interlaced && IS_INTRA( type_col[0] )) || (ref[0]&&ref[1]) ) return 1; /* Don't do any checks other than the ones we have to, based * on the size of the colocated partitions. 
* Depends on the enum order: D_8x8, D_16x8, D_8x16, D_16x16 */ int max_i8 = (D_16x16 - h->mb.i_partition) + 1; int step = (h->mb.i_partition == D_16x8) + 1; int width = 4 >> ((D_16x16 - h->mb.i_partition)&1); int height = 4 >> ((D_16x16 - h->mb.i_partition)>>1); /* col_zero_flag */ for( int i8 = 0; i8 < max_i8; i8 += step ) { const int x8 = i8&1; const int y8 = i8>>1; int ypart = (b_interlaced && h->fref[1][0]->field[mb_xy] != MB_INTERLACED) ? MB_INTERLACED ? y8*6 : 2*(h->mb.i_mb_y&1) + y8 : 3*y8; int o8 = x8 + (ypart>>1) * h->mb.i_b8_stride; int o4 = 3*x8 + ypart * h->mb.i_b4_stride; if( b_interlaced && IS_INTRA( type_col[y8] ) ) continue; int idx; if( l1ref0[o8] == 0 ) idx = 0; else if( l1ref0[o8] < 0 && l1ref1[o8] == 0 ) idx = 1; else continue; if( abs( l1mv[idx][o4][0] ) <= 1 && abs( l1mv[idx][o4][1] ) <= 1 ) { if( ref[0] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 0, 0 ); if( ref[1] == 0 ) x264_macroblock_cache_mv( h, 2*x8, 2*y8, width, height, 1, 0 ); } } return 1; } static int mb_predict_mv_direct16x16_spatial_interlaced( x264_t *h ) { return mb_predict_mv_direct16x16_spatial( h, 1 ); } static int mb_predict_mv_direct16x16_spatial_progressive( x264_t *h ) { return mb_predict_mv_direct16x16_spatial( h, 0 ); } int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed ) { int b_available; if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE ) return 0; else if( h->sh.b_direct_spatial_mv_pred ) { if( SLICE_MBAFF ) b_available = mb_predict_mv_direct16x16_spatial_interlaced( h ); else b_available = mb_predict_mv_direct16x16_spatial_progressive( h ); } else b_available = mb_predict_mv_direct16x16_temporal( h ); if( b_changed != NULL && b_available ) { int changed; changed = (int)(M32( h->mb.cache.direct_mv[0][0] ) ^ M32( h->mb.cache.mv[0][x264_scan8[0]] )); changed |= (int)(M32( h->mb.cache.direct_mv[1][0] ) ^ M32( h->mb.cache.mv[1][x264_scan8[0]] )); changed |= h->mb.cache.direct_ref[0][0] ^ h->mb.cache.ref[0][x264_scan8[0]]; changed |= h->mb.cache.direct_ref[1][0] ^ h->mb.cache.ref[1][x264_scan8[0]]; if( !changed && h->mb.i_partition != D_16x16 ) { changed |= (int)(M32( h->mb.cache.direct_mv[0][3] ) ^ M32( h->mb.cache.mv[0][x264_scan8[12]] )); changed |= (int)(M32( h->mb.cache.direct_mv[1][3] ) ^ M32( h->mb.cache.mv[1][x264_scan8[12]] )); changed |= h->mb.cache.direct_ref[0][3] ^ h->mb.cache.ref[0][x264_scan8[12]]; changed |= h->mb.cache.direct_ref[1][3] ^ h->mb.cache.ref[1][x264_scan8[12]]; } if( !changed && h->mb.i_partition == D_8x8 ) { changed |= (int)(M32( h->mb.cache.direct_mv[0][1] ) ^ M32( h->mb.cache.mv[0][x264_scan8[4]] )); changed |= (int)(M32( h->mb.cache.direct_mv[1][1] ) ^ M32( h->mb.cache.mv[1][x264_scan8[4]] )); changed |= (int)(M32( h->mb.cache.direct_mv[0][2] ) ^ M32( h->mb.cache.mv[0][x264_scan8[8]] )); changed |= (int)(M32( h->mb.cache.direct_mv[1][2] ) ^ M32( h->mb.cache.mv[1][x264_scan8[8]] )); changed |= h->mb.cache.direct_ref[0][1] ^ h->mb.cache.ref[0][x264_scan8[4]]; changed |= h->mb.cache.direct_ref[1][1] ^ h->mb.cache.ref[1][x264_scan8[4]]; changed |= h->mb.cache.direct_ref[0][2] ^ h->mb.cache.ref[0][x264_scan8[8]]; changed |= h->mb.cache.direct_ref[1][2] ^ h->mb.cache.ref[1][x264_scan8[8]]; } *b_changed = changed; if( !changed ) return b_available; } /* cache ref & mv */ if( b_available ) for( int l = 0; l < 2; l++ ) { CP32( h->mb.cache.direct_mv[l][0], h->mb.cache.mv[l][x264_scan8[ 0]] ); CP32( h->mb.cache.direct_mv[l][1], h->mb.cache.mv[l][x264_scan8[ 4]] ); CP32( h->mb.cache.direct_mv[l][2], h->mb.cache.mv[l][x264_scan8[ 8]] 
); CP32( h->mb.cache.direct_mv[l][3], h->mb.cache.mv[l][x264_scan8[12]] ); h->mb.cache.direct_ref[l][0] = h->mb.cache.ref[l][x264_scan8[ 0]]; h->mb.cache.direct_ref[l][1] = h->mb.cache.ref[l][x264_scan8[ 4]]; h->mb.cache.direct_ref[l][2] = h->mb.cache.ref[l][x264_scan8[ 8]]; h->mb.cache.direct_ref[l][3] = h->mb.cache.ref[l][x264_scan8[12]]; h->mb.cache.direct_partition = h->mb.i_partition; } return b_available; } /* This just improves encoder performance, it's not part of the spec */ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t (*mvc)[2], int *i_mvc ) { int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref]; int i = 0; #define SET_MVP(mvp) \ { \ CP32( mvc[i], mvp ); \ i++; \ } #define SET_IMVP(xy) \ if( xy >= 0 ) \ { \ int shift = 1 + MB_INTERLACED - h->mb.field[xy]; \ int16_t *mvp = h->mb.mvr[i_list][i_ref<<1>>shift][xy]; \ mvc[i][0] = mvp[0]; \ mvc[i][1] = mvp[1]*2>>shift; \ i++; \ } /* b_direct */ if( h->sh.i_type == SLICE_TYPE_B && h->mb.cache.ref[i_list][x264_scan8[12]] == i_ref ) { SET_MVP( h->mb.cache.mv[i_list][x264_scan8[12]] ); } if( i_ref == 0 && h->frames.b_have_lowres ) { int idx = i_list ? h->fref[1][0]->i_frame-h->fenc->i_frame-1 : h->fenc->i_frame-h->fref[0][0]->i_frame-1; if( idx <= h->param.i_bframe ) { int16_t (*lowres_mv)[2] = h->fenc->lowres_mvs[i_list][idx]; if( lowres_mv[0][0] != 0x7fff ) { M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff; i++; } } } /* spatial predictors */ if( SLICE_MBAFF ) { SET_IMVP( h->mb.i_mb_left_xy[0] ); SET_IMVP( h->mb.i_mb_top_xy ); SET_IMVP( h->mb.i_mb_topleft_xy ); SET_IMVP( h->mb.i_mb_topright_xy ); } else { SET_MVP( mvr[h->mb.i_mb_left_xy[0]] ); SET_MVP( mvr[h->mb.i_mb_top_xy] ); SET_MVP( mvr[h->mb.i_mb_topleft_xy] ); SET_MVP( mvr[h->mb.i_mb_topright_xy] ); } #undef SET_IMVP #undef SET_MVP /* temporal predictors */ if( h->fref[0][0]->i_ref[0] > 0 ) { x264_frame_t *l0 = h->fref[0][0]; int field = h->mb.i_mb_y&1; int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field]; int refpoc = h->fref[i_list][i_ref>>SLICE_MBAFF]->i_poc; refpoc += l0->i_delta_poc[field^(i_ref&1)]; #define SET_TMVP( dx, dy ) \ { \ int mb_index = h->mb.i_mb_xy + dx + dy*h->mb.i_mb_stride; \ int scale = (curpoc - refpoc) * l0->inv_ref_poc[MB_INTERLACED&field]; \ mvc[i][0] = x264_clip3( (l0->mv16x16[mb_index][0]*scale + 128) >> 8, INT16_MIN, INT16_MAX ); \ mvc[i][1] = x264_clip3( (l0->mv16x16[mb_index][1]*scale + 128) >> 8, INT16_MIN, INT16_MAX ); \ i++; \ } SET_TMVP(0,0); if( h->mb.i_mb_x < h->mb.i_mb_width-1 ) SET_TMVP(1,0); if( h->mb.i_mb_y < h->mb.i_mb_height-1 ) SET_TMVP(0,1); #undef SET_TMVP } *i_mvc = i; } x264-master/common/opencl.c000066400000000000000000000622421502133446700160010ustar00rootroot00000000000000/***************************************************************************** * opencl.c: OpenCL initialization and kernel compilation ***************************************************************************** * Copyright (C) 2012-2025 x264 project * * Authors: Steve Borho * Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"

#ifdef _WIN32
#include <windows.h>
#define ocl_open LoadLibraryW( L"OpenCL" )
#define ocl_close FreeLibrary
#define ocl_address GetProcAddress
#else
#include <dlfcn.h> //dlopen, dlsym, dlclose
#if SYS_MACOSX
#define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW )
#else
#define ocl_open dlopen( "libOpenCL.so", RTLD_NOW )
#endif
#define ocl_close dlclose
#define ocl_address dlsym
#endif

#define LOAD_OCL_FUNC(name, continue_on_fail)\
{\
    ocl->name = (void*)ocl_address( ocl->library, #name );\
    if( !continue_on_fail && !ocl->name )\
        goto fail;\
}

/* load the library and functions we require from it */
x264_opencl_function_t *x264_opencl_load_library( void )
{
    x264_opencl_function_t *ocl;
#undef fail
#define fail fail0
    CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) );
#undef fail
#define fail fail1
    ocl->library = ocl_open;
    if( !ocl->library )
        goto fail;
#undef fail
#define fail fail2
    LOAD_OCL_FUNC( clBuildProgram, 0 );
    LOAD_OCL_FUNC( clCreateBuffer, 0 );
    LOAD_OCL_FUNC( clCreateCommandQueue, 0 );
    LOAD_OCL_FUNC( clCreateContext, 0 );
    LOAD_OCL_FUNC( clCreateImage2D, 0 );
    LOAD_OCL_FUNC( clCreateKernel, 0 );
    LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 );
    LOAD_OCL_FUNC( clCreateProgramWithSource, 0 );
    LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 );
    LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 );
    LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 );
    LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 );
    LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 );
    LOAD_OCL_FUNC( clFinish, 0 );
    LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 );
    LOAD_OCL_FUNC( clGetDeviceIDs, 0 );
    LOAD_OCL_FUNC( clGetDeviceInfo, 0 );
    LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 );
    LOAD_OCL_FUNC( clGetPlatformIDs, 0 );
    LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 );
    LOAD_OCL_FUNC( clGetProgramInfo, 0 );
    LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 );
    LOAD_OCL_FUNC( clReleaseCommandQueue, 0 );
    LOAD_OCL_FUNC( clReleaseContext, 0 );
    LOAD_OCL_FUNC( clReleaseKernel, 0 );
    LOAD_OCL_FUNC( clReleaseMemObject, 0 );
    LOAD_OCL_FUNC( clReleaseProgram, 0 );
    LOAD_OCL_FUNC( clSetKernelArg, 0 );
    return ocl;
#undef fail
fail2:
    ocl_close( ocl->library );
fail1:
    x264_free( ocl );
fail0:
    return NULL;
}

void x264_opencl_close_library( x264_opencl_function_t *ocl )
{
    if( !ocl )
        return;
    ocl_close( ocl->library );
    x264_free( ocl );
}

/* define from recent cl_ext.h, copied here in case headers are old */
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042

/* Requires full include path in case of out-of-tree builds */
#include "common/oclobj.h"

static int detect_switchable_graphics( void );

/* Try to load the cached compiled program binary, verify the device context is
 * still valid before reuse */
static cl_program opencl_cache_load( x264_t *h, const char *dev_name, const char *dev_vendor, const char *driver_version )
{
    /* try to load cached program binary */
    FILE *fp = x264_fopen( h->param.psz_clbin_file, "rb" );
    if( !fp )
        return NULL;

    x264_opencl_function_t *ocl = h->opencl.ocl;
    cl_program program = NULL;
    uint8_t *binary = NULL;

    fseek( fp, 0, SEEK_END );
    int64_t file_size = ftell( fp );
    fseek( fp, 0, SEEK_SET );
    if( file_size < 0 || (uint64_t)file_size > SIZE_MAX )
goto fail; size_t size = file_size; CHECKED_MALLOC( binary, size ); if( fread( binary, 1, size, fp ) != size ) goto fail; const uint8_t *ptr = (const uint8_t*)binary; #define CHECK_STRING( STR )\ do {\ size_t len = strlen( STR );\ if( size <= len || strncmp( (char*)ptr, STR, len ) )\ goto fail;\ else {\ size -= (len+1); ptr += (len+1);\ }\ } while( 0 ) CHECK_STRING( dev_name ); CHECK_STRING( dev_vendor ); CHECK_STRING( driver_version ); CHECK_STRING( x264_opencl_source_hash ); #undef CHECK_STRING cl_int status; program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status ); if( status != CL_SUCCESS ) program = NULL; fail: fclose( fp ); x264_free( binary ); return program; } /* Save the compiled program binary to a file for later reuse. Device context * is also saved in the cache file so we do not reuse stale binaries */ static void opencl_cache_save( x264_t *h, cl_program program, const char *dev_name, const char *dev_vendor, const char *driver_version ) { FILE *fp = x264_fopen( h->param.psz_clbin_file, "wb" ); if( !fp ) { x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" ); return; } x264_opencl_function_t *ocl = h->opencl.ocl; uint8_t *binary = NULL; size_t size = 0; cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL ); if( status != CL_SUCCESS || !size ) { x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" ); goto fail; } CHECKED_MALLOC( binary, size ); status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL ); if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" ); goto fail; } fputs( dev_name, fp ); fputc( '\n', fp ); fputs( dev_vendor, fp ); fputc( '\n', fp ); fputs( driver_version, fp ); fputc( '\n', fp ); fputs( x264_opencl_source_hash, fp ); fputc( '\n', fp ); fwrite( binary, 1, size, fp ); fail: fclose( fp ); x264_free( binary ); return; } /* The OpenCL source under common/opencl will be merged into common/oclobj.h by * the Makefile. It defines a x264_opencl_source byte array which we will pass * to clCreateProgramWithSource(). We also attempt to use a cache file for the * compiled binary, stored in the current working folder. */ static cl_program opencl_compile( x264_t *h ) { x264_opencl_function_t *ocl = h->opencl.ocl; cl_program program = NULL; char *build_log = NULL; char dev_name[64]; char dev_vendor[64]; char driver_version[64]; cl_int status; status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL ); status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL ); if( status != CL_SUCCESS ) return NULL; // Most AMD GPUs have vector registers int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." 
); h->opencl.b_device_AMD_SI = 0; if( vectorize ) { /* Disable OpenCL on Intel/AMD switchable graphics devices */ if( detect_switchable_graphics() ) { x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" ); return NULL; } /* Detect AMD SouthernIsland or newer device (single-width registers) */ cl_uint simdwidth = 4; status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL ); if( status == CL_SUCCESS && simdwidth == 1 ) { vectorize = 0; h->opencl.b_device_AMD_SI = 1; } } x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" ); program = opencl_cache_load( h, dev_name, dev_vendor, driver_version ); if( !program ) { /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */ x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" ); const char *strptr = (const char*)x264_opencl_source; size_t size = sizeof(x264_opencl_source); program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status ); if( status != CL_SUCCESS || !program ) { x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" ); return NULL; } } /* Build the program binary for the OpenCL device */ const char *buildopts = vectorize ? "-DVECTORIZE=1" : ""; status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL ); if( status == CL_SUCCESS ) { opencl_cache_save( h, program, dev_name, dev_vendor, driver_version ); return program; } /* Compile failure, should not happen with production code. */ size_t build_log_len = 0; status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len ); if( status != CL_SUCCESS || !build_log_len ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" ); goto fail; } build_log = x264_malloc( build_log_len ); if( !build_log ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" ); goto fail; } status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL ); if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" ); goto fail; } FILE *log_file = x264_fopen( "x264_kernel_build_log.txt", "w" ); if( !log_file ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" ); goto fail; } fwrite( build_log, 1, build_log_len, log_file ); fclose( log_file ); x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" ); fail: x264_free( build_log ); if( program ) ocl->clReleaseProgram( program ); return NULL; } static int opencl_lookahead_alloc( x264_t *h ) { if( !h->param.rc.i_lookahead ) return -1; static const char *kernelnames[] = { "mb_intra_cost_satd_8x8", "sum_intra_cost", "downscale_hpel", "downscale1", "downscale2", "memset_int16", "weightp_scaled_images", "weightp_hpel", "hierarchical_motion", "subpel_refine", "mode_selection", "sum_inter_cost" }; cl_kernel *kernels[] = { &h->opencl.intra_kernel, &h->opencl.rowsum_intra_kernel, &h->opencl.downscale_hpel_kernel, &h->opencl.downscale_kernel1, &h->opencl.downscale_kernel2, &h->opencl.memset_kernel, &h->opencl.weightp_scaled_images_kernel, &h->opencl.weightp_hpel_kernel, &h->opencl.hme_kernel, &h->opencl.subpel_refine_kernel, 
&h->opencl.mode_select_kernel, &h->opencl.rowsum_inter_kernel }; x264_opencl_function_t *ocl = h->opencl.ocl; cl_int status; h->opencl.lookahead_program = opencl_compile( h ); if( !h->opencl.lookahead_program ) goto fail; for( int i = 0; i < ARRAY_ELEMS(kernelnames); i++ ) { *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status ); if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status ); goto fail; } } h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status ); if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status ); goto fail; } h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status ); if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status ); goto fail; } return 0; fail: x264_opencl_lookahead_delete( h ); return -1; } static void CL_CALLBACK opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data ) { /* Any error notification can be assumed to be fatal to the OpenCL context. * We need to stop using it immediately to prevent further damage. */ x264_t *h = (x264_t*)user_data; h->param.b_opencl = 0; h->opencl.b_fatal_error = 1; x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo ); x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" ); } int x264_opencl_lookahead_init( x264_t *h ) { x264_opencl_function_t *ocl = h->opencl.ocl; cl_platform_id *platforms = NULL; cl_device_id *devices = NULL; cl_image_format *imageType = NULL; cl_context context = NULL; int ret = -1; cl_uint numPlatforms = 0; cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms ); if( status != CL_SUCCESS || !numPlatforms ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); goto fail; } platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms ); if( !platforms ) { x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" ); goto fail; } status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL ); if( status != CL_SUCCESS ) { x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" ); goto fail; } /* Select the first OpenCL platform with a GPU device that supports our * required image (texture) formats */ for( cl_uint i = 0; i < numPlatforms; i++ ) { cl_uint gpu_count = 0; status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count ); if( status != CL_SUCCESS || !gpu_count ) continue; x264_free( devices ); devices = x264_malloc( sizeof(cl_device_id) * gpu_count ); if( !devices ) continue; status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL ); if( status != CL_SUCCESS ) continue; /* Find a GPU device that supports our image formats */ for( cl_uint gpu = 0; gpu < gpu_count; gpu++ ) { h->opencl.device = devices[gpu]; /* if the user has specified an exact device ID, skip all other * GPUs. If this device matches, allow it to continue through the * checks for supported images, etc. 
*/ if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id ) continue; cl_bool image_support = 0; status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL ); if( status != CL_SUCCESS || !image_support ) continue; if( context ) ocl->clReleaseContext( context ); context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)opencl_error_notify, (void*)h, &status ); if( status != CL_SUCCESS || !context ) continue; cl_uint imagecount = 0; status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount ); if( status != CL_SUCCESS || !imagecount ) continue; x264_free( imageType ); imageType = x264_malloc( sizeof(cl_image_format) * imagecount ); if( !imageType ) continue; status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL ); if( status != CL_SUCCESS ) continue; int b_has_r = 0; int b_has_rgba = 0; for( cl_uint j = 0; j < imagecount; j++ ) { if( imageType[j].image_channel_order == CL_R && imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 ) b_has_r = 1; else if( imageType[j].image_channel_order == CL_RGBA && imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 ) b_has_rgba = 1; } if( !b_has_r || !b_has_rgba ) { char dev_name[64]; status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL ); if( status == CL_SUCCESS ) { /* emit warning if we are discarding the user's explicit choice */ int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG; x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name ); } continue; } /* user selection of GPU device, skip N first matches */ if( h->param.i_opencl_device ) { h->param.i_opencl_device--; continue; } h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status ); if( status != CL_SUCCESS || !h->opencl.queue ) continue; h->opencl.context = context; context = NULL; ret = 0; break; } if( !ret ) break; } if( !h->param.psz_clbin_file ) h->param.psz_clbin_file = "x264_lookahead.clbin"; if( ret ) x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" ); else ret = opencl_lookahead_alloc( h ); fail: if( context ) ocl->clReleaseContext( context ); x264_free( imageType ); x264_free( devices ); x264_free( platforms ); return ret; } static void opencl_lookahead_free( x264_t *h ) { x264_opencl_function_t *ocl = h->opencl.ocl; #define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 ) RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel ); RELEASE( h->opencl.downscale_kernel1, clReleaseKernel ); RELEASE( h->opencl.downscale_kernel2, clReleaseKernel ); RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel ); RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel ); RELEASE( h->opencl.memset_kernel, clReleaseKernel ); RELEASE( h->opencl.intra_kernel, clReleaseKernel ); RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel ); RELEASE( h->opencl.hme_kernel, clReleaseKernel ); RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel ); RELEASE( h->opencl.mode_select_kernel, clReleaseKernel ); RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel ); RELEASE( h->opencl.lookahead_program, clReleaseProgram ); RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject ); RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject ); RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject ); for( 
int i = 0; i < NUM_IMAGE_SCALES; i++ ) RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject ); RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject ); RELEASE( h->opencl.row_satds[0], clReleaseMemObject ); RELEASE( h->opencl.row_satds[1], clReleaseMemObject ); RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject ); RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject ); RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject ); RELEASE( h->opencl.mvp_buffer, clReleaseMemObject ); RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject ); RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject ); RELEASE( h->opencl.frame_stats[0], clReleaseMemObject ); RELEASE( h->opencl.frame_stats[1], clReleaseMemObject ); #undef RELEASE } void x264_opencl_lookahead_delete( x264_t *h ) { x264_opencl_function_t *ocl = h->opencl.ocl; if( !ocl ) return; if( h->opencl.queue ) ocl->clFinish( h->opencl.queue ); opencl_lookahead_free( h ); if( h->opencl.queue ) { ocl->clReleaseCommandQueue( h->opencl.queue ); h->opencl.queue = NULL; } if( h->opencl.context ) { ocl->clReleaseContext( h->opencl.context ); h->opencl.context = NULL; } } void x264_opencl_frame_delete( x264_frame_t *frame ) { x264_opencl_function_t *ocl = frame->opencl.ocl; if( !ocl ) return; #define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 ) for( int j = 0; j < NUM_IMAGE_SCALES; j++ ) RELEASEBUF( frame->opencl.scaled_image2Ds[j] ); RELEASEBUF( frame->opencl.luma_hpel ); RELEASEBUF( frame->opencl.inv_qscale_factor ); RELEASEBUF( frame->opencl.intra_cost ); RELEASEBUF( frame->opencl.lowres_mvs0 ); RELEASEBUF( frame->opencl.lowres_mvs1 ); RELEASEBUF( frame->opencl.lowres_mv_costs0 ); RELEASEBUF( frame->opencl.lowres_mv_costs1 ); #undef RELEASEBUF } /* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so * we consult AMD's ADL interface to detect this situation and disable * OpenCL on these machines (Linux and Windows) */ #ifdef _WIN32 #define ADL_API_CALL #define ADL_CALLBACK __stdcall #define adl_close FreeLibrary #define adl_address GetProcAddress #else #define ADL_API_CALL #define ADL_CALLBACK #define adl_close dlclose #define adl_address dlsym #endif typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int ); typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int ); typedef int ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * ); typedef int ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * ); typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void ); #define ADL_OK 0 #define ADL_PX_SCHEME_DYNAMIC 2 static void* ADL_CALLBACK adl_malloc_wrapper( int iSize ) { return x264_malloc( iSize ); } static int detect_switchable_graphics( void ) { void *hDLL; ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create; ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get; ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get; ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy; int ret = 0; #ifdef _WIN32 hDLL = LoadLibraryW( L"atiadlxx.dll" ); if( !hDLL ) hDLL = LoadLibraryW( L"atiadlxy.dll" ); #else hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL ); #endif if( !hDLL ) goto fail0; ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create"); ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy"); ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get"); 
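    /* ADL_PowerXpress_Scheme_Get is the query that matters here: once the ADL
     * session is created, each adapter reports its PowerXpress scheme, and any
     * adapter whose scheme range reaches ADL_PX_SCHEME_DYNAMIC marks the system
     * as switchable graphics, making this function return 1 so the caller
     * disables OpenCL. */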
ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get"); if( !ADL_Main_Control_Create || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get || !ADL_PowerXpress_Scheme_Get ) goto fail1; if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) ) goto fail1; int numAdapters = 0; if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) ) goto fail2; for( int i = 0; i < numAdapters; i++ ) { int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState; if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) ) break; if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC ) { ret = 1; break; } } fail2: ADL_Main_Control_Destroy(); fail1: adl_close( hDLL ); fail0: return ret; } x264-master/common/opencl.h000066400000000000000000000722261502133446700160110ustar00rootroot00000000000000/***************************************************************************** * opencl.h: OpenCL structures and defines ***************************************************************************** * Copyright (C) 2012-2025 x264 project * * Authors: Steve Borho * Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_OPENCL_H #define X264_OPENCL_H #define CL_USE_DEPRECATED_OPENCL_1_1_APIS #include "extras/cl.h" #define OCL_API(ret, attr, name) typedef ret (attr *name##_func) /* Platform API */ OCL_API(cl_int, CL_API_CALL, clGetPlatformIDs) ( cl_uint /* num_entries */, cl_platform_id * /* platforms */, cl_uint * /* num_platforms */); OCL_API(cl_int, CL_API_CALL, clGetPlatformInfo) ( cl_platform_id /* platform */, cl_platform_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Device APIs */ OCL_API(cl_int, CL_API_CALL, clGetDeviceIDs) ( cl_platform_id /* platform */, cl_device_type /* device_type */, cl_uint /* num_entries */, cl_device_id * /* devices */, cl_uint * /* num_devices */); OCL_API(cl_int, CL_API_CALL, clGetDeviceInfo) ( cl_device_id /* device */, cl_device_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_int, CL_API_CALL, clCreateSubDevices) ( cl_device_id /* in_device */, const cl_device_partition_property * /* properties */, cl_uint /* num_devices */, cl_device_id * /* out_devices */, cl_uint * /* num_devices_ret */); OCL_API(cl_int, CL_API_CALL, clRetainDevice) ( cl_device_id /* device */); OCL_API(cl_int, CL_API_CALL, clReleaseDevice) ( cl_device_id /* device */); /* Context APIs */ OCL_API(cl_context, CL_API_CALL, clCreateContext) ( const cl_context_properties * /* properties */, cl_uint /* num_devices */, const cl_device_id * /* devices */, void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), void * /* user_data */, cl_int * /* errcode_ret */); OCL_API(cl_context, CL_API_CALL, clCreateContextFromType) ( const cl_context_properties * /* properties */, cl_device_type /* device_type */, void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), void * /* user_data */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clRetainContext) ( cl_context /* context */); OCL_API(cl_int, CL_API_CALL, clReleaseContext) ( cl_context /* context */); OCL_API(cl_int, CL_API_CALL, clGetContextInfo) ( cl_context /* context */, cl_context_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Command Queue APIs */ OCL_API(cl_command_queue, CL_API_CALL, clCreateCommandQueue) ( cl_context /* context */, cl_device_id /* device */, cl_command_queue_properties /* properties */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clRetainCommandQueue) ( cl_command_queue /* command_queue */); OCL_API(cl_int, CL_API_CALL, clReleaseCommandQueue) ( cl_command_queue /* command_queue */); OCL_API(cl_int, CL_API_CALL, clGetCommandQueueInfo) ( cl_command_queue /* command_queue */, cl_command_queue_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Memory Object APIs */ OCL_API(cl_mem, CL_API_CALL, clCreateBuffer) ( cl_context /* context */, cl_mem_flags /* flags */, size_t /* size */, void * /* host_ptr */, cl_int * /* errcode_ret */); OCL_API(cl_mem, CL_API_CALL, clCreateSubBuffer) ( cl_mem /* buffer */, cl_mem_flags /* flags */, cl_buffer_create_type /* buffer_create_type */, const void * /* buffer_create_info */, cl_int * /* errcode_ret */); OCL_API(cl_mem, CL_API_CALL, clCreateImage) ( cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format 
*/, const cl_image_desc * /* image_desc */, void * /* host_ptr */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clRetainMemObject) ( cl_mem /* memobj */); OCL_API(cl_int, CL_API_CALL, clReleaseMemObject) ( cl_mem /* memobj */); OCL_API(cl_int, CL_API_CALL, clGetSupportedImageFormats) ( cl_context /* context */, cl_mem_flags /* flags */, cl_mem_object_type /* image_type */, cl_uint /* num_entries */, cl_image_format * /* image_formats */, cl_uint * /* num_image_formats */); OCL_API(cl_int, CL_API_CALL, clGetMemObjectInfo) ( cl_mem /* memobj */, cl_mem_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_int, CL_API_CALL, clGetImageInfo) ( cl_mem /* image */, cl_image_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_int, CL_API_CALL, clSetMemObjectDestructorCallback) ( cl_mem /* memobj */, void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), void * /*user_data */ ); /* Sampler APIs */ OCL_API(cl_sampler, CL_API_CALL, clCreateSampler) ( cl_context /* context */, cl_bool /* normalized_coords */, cl_addressing_mode /* addressing_mode */, cl_filter_mode /* filter_mode */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clRetainSampler) ( cl_sampler /* sampler */); OCL_API(cl_int, CL_API_CALL, clReleaseSampler) ( cl_sampler /* sampler */); OCL_API(cl_int, CL_API_CALL, clGetSamplerInfo) ( cl_sampler /* sampler */, cl_sampler_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Program Object APIs */ OCL_API(cl_program, CL_API_CALL, clCreateProgramWithSource) ( cl_context /* context */, cl_uint /* count */, const char ** /* strings */, const size_t * /* lengths */, cl_int * /* errcode_ret */); OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBinary) ( cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const size_t * /* lengths */, const unsigned char ** /* binaries */, cl_int * /* binary_status */, cl_int * /* errcode_ret */); OCL_API(cl_program, CL_API_CALL, clCreateProgramWithBuiltInKernels) ( cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* kernel_names */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clRetainProgram) ( cl_program /* program */); OCL_API(cl_int, CL_API_CALL, clReleaseProgram) ( cl_program /* program */); OCL_API(cl_int, CL_API_CALL, clBuildProgram) ( cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* options */, void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), void * /* user_data */); OCL_API(cl_int, CL_API_CALL, clCompileProgram) ( cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* options */, cl_uint /* num_input_headers */, const cl_program * /* input_headers */, const char ** /* header_include_names */, void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), void * /* user_data */); OCL_API(cl_program, CL_API_CALL, clLinkProgram) ( cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* options */, cl_uint /* num_input_programs */, const cl_program * /* input_programs */, void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* 
user_data */), void * /* user_data */, cl_int * /* errcode_ret */ ); OCL_API(cl_int, CL_API_CALL, clUnloadPlatformCompiler) ( cl_platform_id /* platform */); OCL_API(cl_int, CL_API_CALL, clGetProgramInfo) ( cl_program /* program */, cl_program_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_int, CL_API_CALL, clGetProgramBuildInfo) ( cl_program /* program */, cl_device_id /* device */, cl_program_build_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Kernel Object APIs */ OCL_API(cl_kernel, CL_API_CALL, clCreateKernel) ( cl_program /* program */, const char * /* kernel_name */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clCreateKernelsInProgram) ( cl_program /* program */, cl_uint /* num_kernels */, cl_kernel * /* kernels */, cl_uint * /* num_kernels_ret */); OCL_API(cl_int, CL_API_CALL, clRetainKernel) ( cl_kernel /* kernel */); OCL_API(cl_int, CL_API_CALL, clReleaseKernel) ( cl_kernel /* kernel */); OCL_API(cl_int, CL_API_CALL, clSetKernelArg) ( cl_kernel /* kernel */, cl_uint /* arg_index */, size_t /* arg_size */, const void * /* arg_value */); OCL_API(cl_int, CL_API_CALL, clGetKernelInfo) ( cl_kernel /* kernel */, cl_kernel_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_int, CL_API_CALL, clGetKernelArgInfo) ( cl_kernel /* kernel */, cl_uint /* arg_indx */, cl_kernel_arg_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_int, CL_API_CALL, clGetKernelWorkGroupInfo) ( cl_kernel /* kernel */, cl_device_id /* device */, cl_kernel_work_group_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Event Object APIs */ OCL_API(cl_int, CL_API_CALL, clWaitForEvents) ( cl_uint /* num_events */, const cl_event * /* event_list */); OCL_API(cl_int, CL_API_CALL, clGetEventInfo) ( cl_event /* event */, cl_event_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); OCL_API(cl_event, CL_API_CALL, clCreateUserEvent) ( cl_context /* context */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clRetainEvent) ( cl_event /* event */); OCL_API(cl_int, CL_API_CALL, clReleaseEvent) ( cl_event /* event */); OCL_API(cl_int, CL_API_CALL, clSetUserEventStatus) ( cl_event /* event */, cl_int /* execution_status */); OCL_API(cl_int, CL_API_CALL, clSetEventCallback) ( cl_event /* event */, cl_int /* command_exec_callback_type */, void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), void * /* user_data */); /* Profiling APIs */ OCL_API(cl_int, CL_API_CALL, clGetEventProfilingInfo) ( cl_event /* event */, cl_profiling_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */); /* Flush and Finish APIs */ OCL_API(cl_int, CL_API_CALL, clFlush) ( cl_command_queue /* command_queue */); OCL_API(cl_int, CL_API_CALL, clFinish) ( cl_command_queue /* command_queue */); /* Enqueued Commands APIs */ OCL_API(cl_int, CL_API_CALL, clEnqueueReadBuffer) ( cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, size_t /* offset */, size_t /* size */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); 
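/* For reference, every declaration in this header goes through the OCL_API macro
 * defined near the top of the file, so a short entry such as
 *
 *     OCL_API(cl_int, CL_API_CALL, clFinish) ( cl_command_queue );
 *
 * expands to a function-pointer typedef:
 *
 *     typedef cl_int (CL_API_CALL *clFinish_func) ( cl_command_queue );
 *
 * x264_opencl_function_t (built from OCL_DECLARE_FUNC further down) stores one
 * such pointer per entry point, which x264_opencl_load_library() resolves at
 * runtime via dlsym()/GetProcAddress(). */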
OCL_API(cl_int, CL_API_CALL, clEnqueueReadBufferRect) ( cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */, size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, size_t /* host_slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBuffer) ( cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, size_t /* offset */, size_t /* size */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueWriteBufferRect) ( cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */, size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, size_t /* host_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueFillBuffer) ( cl_command_queue /* command_queue */, cl_mem /* buffer */, const void * /* pattern */, size_t /* pattern_size */, size_t /* offset */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBuffer) ( cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, size_t /* src_offset */, size_t /* dst_offset */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferRect) ( cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, const size_t * /* src_origin */, const size_t * /* dst_origin */, const size_t * /* region */, size_t /* src_row_pitch */, size_t /* src_slice_pitch */, size_t /* dst_row_pitch */, size_t /* dst_slice_pitch */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueReadImage) ( cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_read */, const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* row_pitch */, size_t /* slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueWriteImage) ( cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_write */, const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* input_row_pitch */, size_t /* input_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueFillImage) ( cl_command_queue /* command_queue */, cl_mem /* image */, const void * /* fill_color */, const size_t * /* origin[3] */, const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImage) ( cl_command_queue /* command_queue */, cl_mem /* 
src_image */, cl_mem /* dst_image */, const size_t * /* src_origin[3] */, const size_t * /* dst_origin[3] */, const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueCopyImageToBuffer) ( cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_buffer */, const size_t * /* src_origin[3] */, const size_t * /* region[3] */, size_t /* dst_offset */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueCopyBufferToImage) ( cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_image */, size_t /* src_offset */, const size_t * /* dst_origin[3] */, const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(void *, CL_API_CALL, clEnqueueMapBuffer) ( cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_map */, cl_map_flags /* map_flags */, size_t /* offset */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */, cl_int * /* errcode_ret */); OCL_API(void *, CL_API_CALL, clEnqueueMapImage) ( cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_map */, cl_map_flags /* map_flags */, const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t * /* image_row_pitch */, size_t * /* image_slice_pitch */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clEnqueueUnmapMemObject) ( cl_command_queue /* command_queue */, cl_mem /* memobj */, void * /* mapped_ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueMigrateMemObjects) ( cl_command_queue /* command_queue */, cl_uint /* num_mem_objects */, const cl_mem * /* mem_objects */, cl_mem_migration_flags /* flags */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueNDRangeKernel) ( cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* work_dim */, const size_t * /* global_work_offset */, const size_t * /* global_work_size */, const size_t * /* local_work_size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueTask) ( cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueNativeKernel) ( cl_command_queue /* command_queue */, void (CL_CALLBACK * /*user_func*/)(void *), void * /* args */, size_t /* cb_args */, cl_uint /* num_mem_objects */, const cl_mem * /* mem_list */, const void ** /* args_mem_loc */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueMarkerWithWaitList) ( cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueBarrierWithWaitList) ( cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, const cl_event * 
/* event_wait_list */, cl_event * /* event */); /* Extension function access * * Returns the extension function address for the given function name, * or NULL if a valid function can not be found. The client must * check to make sure the address is not NULL, before using or * calling the returned function address. */ OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddressForPlatform) ( cl_platform_id /* platform */, const char * /* func_name */); // Deprecated OpenCL 1.1 APIs OCL_API(cl_mem, CL_API_CALL, clCreateImage2D) ( cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, size_t /* image_width */, size_t /* image_height */, size_t /* image_row_pitch */, void * /* host_ptr */, cl_int * /* errcode_ret */); OCL_API(cl_mem, CL_API_CALL, clCreateImage3D) ( cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, size_t /* image_width */, size_t /* image_height */, size_t /* image_depth */, size_t /* image_row_pitch */, size_t /* image_slice_pitch */, void * /* host_ptr */, cl_int * /* errcode_ret */); OCL_API(cl_int, CL_API_CALL, clEnqueueMarker) ( cl_command_queue /* command_queue */, cl_event * /* event */); OCL_API(cl_int, CL_API_CALL, clEnqueueWaitForEvents) ( cl_command_queue /* command_queue */, cl_uint /* num_events */, const cl_event * /* event_list */); OCL_API(cl_int, CL_API_CALL, clEnqueueBarrier) ( cl_command_queue /* command_queue */); OCL_API(cl_int, CL_API_CALL, clUnloadCompiler) ( void); OCL_API(void *, CL_API_CALL, clGetExtensionFunctionAddress) ( const char * /* func_name */); #define OCL_DECLARE_FUNC(name) name##_func name typedef struct { void *library; OCL_DECLARE_FUNC( clBuildProgram ); OCL_DECLARE_FUNC( clCreateBuffer ); OCL_DECLARE_FUNC( clCreateCommandQueue ); OCL_DECLARE_FUNC( clCreateContext ); OCL_DECLARE_FUNC( clCreateImage2D ); OCL_DECLARE_FUNC( clCreateKernel ); OCL_DECLARE_FUNC( clCreateProgramWithBinary ); OCL_DECLARE_FUNC( clCreateProgramWithSource ); OCL_DECLARE_FUNC( clEnqueueCopyBuffer ); OCL_DECLARE_FUNC( clEnqueueMapBuffer ); OCL_DECLARE_FUNC( clEnqueueNDRangeKernel ); OCL_DECLARE_FUNC( clEnqueueReadBuffer ); OCL_DECLARE_FUNC( clEnqueueWriteBuffer ); OCL_DECLARE_FUNC( clFinish ); OCL_DECLARE_FUNC( clGetCommandQueueInfo ); OCL_DECLARE_FUNC( clGetDeviceIDs ); OCL_DECLARE_FUNC( clGetDeviceInfo ); OCL_DECLARE_FUNC( clGetKernelWorkGroupInfo ); OCL_DECLARE_FUNC( clGetPlatformIDs ); OCL_DECLARE_FUNC( clGetProgramBuildInfo ); OCL_DECLARE_FUNC( clGetProgramInfo ); OCL_DECLARE_FUNC( clGetSupportedImageFormats ); OCL_DECLARE_FUNC( clReleaseCommandQueue ); OCL_DECLARE_FUNC( clReleaseContext ); OCL_DECLARE_FUNC( clReleaseKernel ); OCL_DECLARE_FUNC( clReleaseMemObject ); OCL_DECLARE_FUNC( clReleaseProgram ); OCL_DECLARE_FUNC( clSetKernelArg ); } x264_opencl_function_t; /* Number of downscale resolutions to use for motion search */ #define NUM_IMAGE_SCALES 4 /* Number of PCIe copies that can be queued before requiring a flush */ #define MAX_FINISH_COPIES 1024 /* Size (in bytes) of the page-locked buffer used for PCIe xfers */ #define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024 typedef struct { x264_opencl_function_t *ocl; cl_context context; cl_device_id device; cl_command_queue queue; cl_program lookahead_program; cl_int last_buf; cl_mem page_locked_buffer; char *page_locked_ptr; int pl_occupancy; struct { void *src; void *dest; int bytes; } copies[MAX_FINISH_COPIES]; int num_copies; int b_device_AMD_SI; int b_fatal_error; int lookahead_thread_pri; int opencl_thread_pri; /* downscale 
lowres luma */ cl_kernel downscale_hpel_kernel; cl_kernel downscale_kernel1; cl_kernel downscale_kernel2; cl_mem luma_16x16_image[2]; /* weightp filtering */ cl_kernel weightp_hpel_kernel; cl_kernel weightp_scaled_images_kernel; cl_mem weighted_scaled_images[NUM_IMAGE_SCALES]; cl_mem weighted_luma_hpel; /* intra */ cl_kernel memset_kernel; cl_kernel intra_kernel; cl_kernel rowsum_intra_kernel; cl_mem row_satds[2]; /* hierarchical motion estimation */ cl_kernel hme_kernel; cl_kernel subpel_refine_kernel; cl_mem mv_buffers[2]; cl_mem lowres_mv_costs; cl_mem mvp_buffer; /* bidir */ cl_kernel mode_select_kernel; cl_kernel rowsum_inter_kernel; cl_mem lowres_costs[2]; cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */ } x264_opencl_t; typedef struct { x264_opencl_function_t *ocl; cl_mem scaled_image2Ds[NUM_IMAGE_SCALES]; cl_mem luma_hpel; cl_mem inv_qscale_factor; cl_mem intra_cost; cl_mem lowres_mvs0; cl_mem lowres_mvs1; cl_mem lowres_mv_costs0; cl_mem lowres_mv_costs1; } x264_frame_opencl_t; typedef struct x264_frame x264_frame; #define x264_opencl_load_library x264_template(opencl_load_library) x264_opencl_function_t *x264_opencl_load_library( void ); #define x264_opencl_close_library x264_template(opencl_close_library) void x264_opencl_close_library( x264_opencl_function_t *ocl ); #define x264_opencl_lookahead_init x264_template(opencl_lookahead_init) int x264_opencl_lookahead_init( x264_t *h ); #define x264_opencl_lookahead_delete x264_template(opencl_lookahead_delete) void x264_opencl_lookahead_delete( x264_t *h ); #define x264_opencl_frame_delete x264_template(opencl_frame_delete) void x264_opencl_frame_delete( x264_frame *frame ); #endif x264-master/common/opencl/000077500000000000000000000000001502133446700156275ustar00rootroot00000000000000x264-master/common/opencl/bidir.cl000066400000000000000000000236451502133446700172520ustar00rootroot00000000000000/* Mode selection routines, select the least SATD cost mode for each lowres * macroblock. When measuring B slices, this includes measuring the cost of * three bidir modes. 
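 * The three bidir candidates measured below are the temporally scaled colocated
 * vector pair, the zero-vector (B-direct) pair, and the already-estimated L0+L1
 * vector pair, the last with a small lambda penalty.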
*/ /* Four threads cooperatively measure 8x8 BIDIR cost with SATD */ int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres, int2 fencpos, read_only image2d_t fref0_planes, int2 qpos0, read_only image2d_t fref1_planes, int2 qpos1, int weight, local sum2_t *tmpp, int idx ) { volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; sum2_t b0, b1, b2, b3; sum2_t sum = 0; // fencpos is full-pel position of original MB // qpos0 is qpel position within reference frame 0 // qpos1 is qpel position within reference frame 1 int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2); int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2); int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1)); int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2); int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2); int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2); int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2); int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1)); int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2); int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2); uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B; uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B; uint vA, vB; uint enc, ref0, ref1; uint a0, a1; const int weight2 = 64 - weight; #define READ_BIDIR_DIFF( OUT, X )\ enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\ vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\ vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\ ref0 = rhadd( vA, vB );\ vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\ vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\ ref1 = rhadd( vA, vB );\ OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6); #define READ_DIFF_EX( OUT, a, b )\ READ_BIDIR_DIFF( a0, a );\ READ_BIDIR_DIFF( a1, b );\ OUT = a0 + (a1<>BITS_PER_SUM)) >> 1; } /* * mode selection - pick the least cost partition type for each 8x8 macroblock. * Intra, list0 or list1. When measuring a B slice, also test three bidir * possibilities. * * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that * hold many frames worth of motion vectors. We must offset into the correct * location for this frame's vectors: * * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count] * * global launch dimensions for P slice estimate: [mb_width, mb_height] * global launch dimensions for B slice estimate: [mb_width * 4, mb_height] */ kernel void mode_selection( read_only image2d_t fenc_lowres, read_only image2d_t fref0_planes, read_only image2d_t fref1_planes, const global short2 *fenc_lowres_mvs0, const global short2 *fenc_lowres_mvs1, const global short2 *fref1_lowres_mvs0, const global int16_t *fenc_lowres_mv_costs0, const global int16_t *fenc_lowres_mv_costs1, const global uint16_t *fenc_intra_cost, global uint16_t *lowres_costs, global int *frame_stats, local int16_t *cost_local, local sum2_t *satd_local, int mb_width, int bipred_weight, int dist_scale_factor, int b, int p0, int p1, int lambda ) { int mb_x = get_global_id( 0 ); int b_bidir = b < p1; if( b_bidir ) { /* when mode_selection is run for B frames, it must perform BIDIR SATD * measurements, so it is launched with four times as many threads in * order to spread the work around more of the GPU. And it can add * padding threads in the X direction. 
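 * Concretely, the mb_x >>= 2 below recovers the real macroblock column, while
 * (get_global_id( 0 ) & 3) selects which of the four cooperating lanes this
 * thread is within that macroblock.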
*/ mb_x >>= 2; if( mb_x >= mb_width ) return; } int mb_y = get_global_id( 1 ); int mb_height = get_global_size( 1 ); int mb_count = mb_width * mb_height; int mb_xy = mb_x + mb_y * mb_width; /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */ if( mb_x < 4 && mb_y == 0 ) frame_stats[mb_x] = 0; int bcost = COST_MAX; int list_used = 0; if( !b_bidir ) { int icost = fenc_intra_cost[mb_xy]; COPY2_IF_LT( bcost, icost, list_used, 0 ); } if( b != p0 ) { int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy]; COPY2_IF_LT( bcost, mv_cost0, list_used, 1 ); } if( b != p1 ) { int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy]; COPY2_IF_LT( bcost, mv_cost1, list_used, 2 ); } if( b_bidir ) { int2 coord = (int2)(mb_x, mb_y) << 3; int mb_i = get_global_id( 0 ) & 3; int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); cost_local += mb_in_group * 4; satd_local += mb_in_group * 16; #define TRY_BIDIR( mv0, mv1, penalty )\ {\ int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\ int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\ cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\ int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\ COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\ } /* temporal prediction */ short2 dmv0, dmv1; short2 mvr = fref1_lowres_mvs0[mb_xy]; dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8; dmv1 = dmv0 - mvr; TRY_BIDIR( dmv0, dmv1, 0 ) if( as_uint( dmv0 ) || as_uint( dmv1 ) ) { /* B-direct prediction */ dmv0 = 0; dmv1 = 0; TRY_BIDIR( dmv0, dmv1, 0 ); } /* L0+L1 prediction */ dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy]; dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy]; TRY_BIDIR( dmv0, dmv1, 5 ); #undef TRY_BIDIR } lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); } /* * parallel sum inter costs * * global launch dimensions: [256, mb_height] */ kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs, const global uint16_t *inv_qscale_factor, global int *fenc_row_satds, global int *frame_stats, int mb_width, int bframe_bias, int b, int p0, int p1 ) { int y = get_global_id( 1 ); int mb_height = get_global_size( 1 ); int row_satds = 0; int cost_est = 0; int cost_est_aq = 0; int intra_mbs = 0; for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 )) { int mb_xy = x + y * mb_width; int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK; int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT; int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2; if( list == 0 && b_frame_score_mb ) intra_mbs++; int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8; row_satds += cost_aq; if( b_frame_score_mb ) { cost_est += cost; cost_est_aq += cost_aq; } } local int buffer[256]; int x = get_global_id( 0 ); row_satds = parallel_sum( row_satds, x, buffer ); cost_est = parallel_sum( cost_est, x, buffer ); cost_est_aq = parallel_sum( cost_est_aq, x, buffer ); intra_mbs = parallel_sum( intra_mbs, x, buffer ); if( b != p1 ) // Use floating point math to avoid 32bit integer overflow conditions cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias)); if( get_global_id( 0 ) == 0 ) { fenc_row_satds[y] = row_satds; atomic_add( frame_stats + COST_EST, cost_est ); atomic_add( frame_stats + 
COST_EST_AQ, cost_est_aq ); atomic_add( frame_stats + INTRA_MBS, intra_mbs ); } } x264-master/common/opencl/downscale.cl000066400000000000000000000124451502133446700201340ustar00rootroot00000000000000/* * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image * * -- * * fenc_img is an output image (area of memory referenced through a texture * cache). A read of any pixel location (x,y) returns four pixel values: * * val.s0 = P(x,y) * val.s1 = P(x+1,y) * val.s2 = P(x+2,y) * val.s3 = P(x+3,y) * * This is a 4x replication of the lowres pixels, a trade-off between memory * size and read latency. * * -- * * hpel_planes is an output image that contains the four HPEL planes used for * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with * the four planar values C | V | H | F * * launch dimensions: [lowres-width, lowres-height] */ kernel void downscale_hpel( const global pixel *fenc, write_only image2d_t fenc_img, write_only image2d_t hpel_planes, int stride ) { int x = get_global_id( 0 ); int y = get_global_id( 1 ); uint4 values; fenc += y * stride * 2; const global pixel *src1 = fenc + stride; const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride; int2 pos = (int2)(x, y); pixel right, left; right = rhadd( fenc[x*2], src1[x*2] ); left = rhadd( fenc[x*2+1], src1[x*2+1] ); values.s0 = rhadd( right, left ); // F right = rhadd( fenc[2*x+1], src1[2*x+1] ); left = rhadd( fenc[2*x+2], src1[2*x+2] ); values.s1 = rhadd( right, left ); // H right = rhadd( src1[2*x], src2[2*x] ); left = rhadd( src1[2*x+1], src2[2*x+1] ); values.s2 = rhadd( right, left ); // V right = rhadd( src1[2*x+1], src2[2*x+1] ); left = rhadd( src1[2*x+2], src2[2*x+2] ); values.s3 = rhadd( right, left ); // C uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff); write_imageui( hpel_planes, pos, val ); x = select( x, x+1, x+1 < get_global_size( 0 ) ); right = rhadd( fenc[x*2], src1[x*2] ); left = rhadd( fenc[x*2+1], src1[x*2+1] ); values.s1 = rhadd( right, left ); x = select( x, x+1, x+1 < get_global_size( 0 ) ); right = rhadd( fenc[x*2], src1[x*2] ); left = rhadd( fenc[x*2+1], src1[x*2+1] ); values.s2 = rhadd( right, left ); x = select( x, x+1, x+1 < get_global_size( 0 ) ); right = rhadd( fenc[x*2], src1[x*2] ); left = rhadd( fenc[x*2+1], src1[x*2+1] ); values.s3 = rhadd( right, left ); write_imageui( fenc_img, pos, values ); } /* * downscale lowres hierarchical motion search image, copy from one image to * another decimated image. This kernel is called iteratively to generate all * of the downscales. * * launch dimensions: [lower_res width, lower_res height] */ kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res ) { int x = get_global_id( 0 ); int y = get_global_id( 1 ); int2 pos = (int2)(x, y); int gs = get_global_size( 0 ); uint4 top, bot, values; top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); /* these select statements appear redundant, and they should be, but tests break when * they are not here. 
I believe this was caused by a driver bug */ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); write_imageui( lower_res, pos, (uint4)(values) ); } /* * Second copy of downscale kernel, no differences. This is a (no perf loss) * workaround for a scheduling bug in current Tahiti drivers. This bug has * theoretically been fixed in the July 2012 driver release from AMD. */ kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res ) { int x = get_global_id( 0 ); int y = get_global_id( 1 ); int2 pos = (int2)(x, y); int gs = get_global_size( 0 ); uint4 top, bot, values; top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) ); bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) ); values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ); // see comment in above function copy values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) ); top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) ); bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) ); values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) ); values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) ); write_imageui( lower_res, pos, (uint4)(values) ); } /* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */ kernel void memset_int16( global int16_t *buf, int16_t value ) { buf[get_global_id( 0 )] = value; } x264-master/common/opencl/intra.cl000066400000000000000000001363421502133446700172750ustar00rootroot00000000000000/* Lookahead lowres intra analysis * * Each intra analysis function has been implemented twice, once for scalar GPUs * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects * the GPU type and sets the -DVECTORIZE compile flag accordingly. * * All the intra analysis functions were based on their C versions in pixel.c * and produce the exact same results. 
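 *
 * A minimal sketch of how such a flag can be supplied at kernel build time
 * (illustrative only; the real GPU detection and option string live in x264's
 * OpenCL host code, and the variable names here are placeholders):
 *
 *     // vectorized code paths (pre-Southern Islands AMD style GPUs)
 *     clBuildProgram( program, 1, &device, "-DVECTORIZE=1", NULL, NULL );
 *
 *     // scalar code paths (VECTORIZE left undefined, so #if VECTORIZE is 0)
 *     clBuildProgram( program, 1, &device, "", NULL, NULL );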
*/ /* force all clamp arguments and return value to int, prevent ambiguous types */ #define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) ) #if VECTORIZE int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 ) { int8 a_v, d_v; int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13; int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33; d_v = convert_int8( vload8( 0, data ) ); a_v.s01234567 = (d_v - pr0).s04152637; HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); data += data_stride; d_v = convert_int8( vload8( 0, data ) ); a_v.s01234567 = (d_v - pr1).s04152637; HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); data += data_stride; d_v = convert_int8( vload8( 0, data ) ); a_v.s01234567 = (d_v - pr2).s04152637; HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); data += data_stride; d_v = convert_int8( vload8( 0, data ) ); a_v.s01234567 = (d_v - pr3).s04152637; HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi ); uint8 sum_v; HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 ); sum_v = abs( a_v ); HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 ); sum_v += abs( a_v ); HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 ); sum_v += abs( a_v ); HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 ); sum_v += abs( a_v ); uint4 sum2 = sum_v.hi + sum_v.lo; uint2 sum3 = sum2.hi + sum2.lo; return ( sum3.hi + sum3.lo ) >> 1; } #else SATD_C_8x4_Q( satd_8x4_lp, const local, private ) #endif /**************************************************************************** * 8x8 prediction for intra luma block ****************************************************************************/ #define F1 rhadd #define F2( a, b, c ) ( a+2*b+c+2 )>>2 #if VECTORIZE int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) { int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2; pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2; pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2; pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2; pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; pr3.s3 = ( 2 + top[6] + 2*top[7] + 
top[8] ) >> 2; pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); // Lower half of pred[] pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2; pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2; pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2; pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2; pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2; pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2; pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2; pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2; pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2; pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2; pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2; return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) { int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr3.s0 = F2( left[1], left[2], left[3] ); pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] ); pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top ); pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] ); pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] ); pr0.s7 = F2( top[5], top[6], top[7] ); int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); // Lower half of pred[] pr3.s0 = F2( left[5], left[6], left[7] ); pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] ); pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] ); pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] ); pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] ); pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] ); pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top ); pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] ); pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] ); pr0.s6 = pr1.s7 = F2( top[0], 
top[1], top[2] ); pr0.s7 = F2( top[1], top[2], top[3] ); return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) { int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr2.s0 = F2( left[1], left[0], left_top ); pr3.s0 = F2( left[2], left[1], left[0] ); pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] ); pr0.s0 = pr2.s1 = F1( left_top, top[0] ); pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] ); pr0.s1 = pr2.s2 = F1( top[0], top[1] ); pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] ); pr0.s2 = pr2.s3 = F1( top[1], top[2] ); pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] ); pr0.s3 = pr2.s4 = F1( top[2], top[3] ); pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] ); pr0.s4 = pr2.s5 = F1( top[3], top[4] ); pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] ); pr0.s5 = pr2.s6 = F1( top[4], top[5] ); pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] ); pr0.s6 = pr2.s7 = F1( top[5], top[6] ); pr1.s7 = F2( top[5], top[6], top[7] ); pr0.s7 = F1( top[6], top[7] ); int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); // Lower half of pred[] pr2.s0 = F2( left[5], left[4], left[3] ); pr3.s0 = F2( left[6], left[5], left[4] ); pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] ); pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] ); pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top ); pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] ); pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] ); pr0.s2 = pr2.s3 = F1( left_top, top[0] ); pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] ); pr0.s3 = pr2.s4 = F1( top[0], top[1] ); pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] ); pr0.s4 = pr2.s5 = F1( top[1], top[2] ); pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] ); pr0.s5 = pr2.s6 = F1( top[2], top[3] ); pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] ); pr0.s6 = pr2.s7 = F1( top[3], top[4] ); pr1.s7 = F2( top[3], top[4], top[5] ); pr0.s7 = F1( top[4], top[5] ); return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); #undef PRED } int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) { int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2; pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] ); pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] ); pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] ); pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2; pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2; pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] ); pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] ); pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2; pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2; pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] ); pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2; 
pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2; int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); // Lower half of pred[] pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2; pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top ) { int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr0.s0 = F1( top[0], top[1] ); pr1.s0 = F2( top[0], top[1], top[2] ); pr2.s0 = pr0.s1 = F1( top[1], top[2] ); pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] ); pr2.s1 = pr0.s2 = F1( top[2], top[3] ); pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] ); pr2.s2 = pr0.s3 = F1( top[3], top[4] ); pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] ); pr2.s3 = pr0.s4 = F1( top[4], top[5] ); pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] ); pr2.s4 = pr0.s5 = F1( top[5], top[6] ); pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] ); pr2.s5 = pr0.s6 = F1( top[6], top[7] ); pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] ); pr2.s6 = pr0.s7 = F1( top[7], top[8] ); pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] ); pr2.s7 = F1( top[8], top[9] ); pr3.s7 = F2( top[8], top[9], top[10] ); int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); // Lower half of pred[] pr0.s0 = F1( top[2], top[3] ); pr1.s0 = F2( top[2], top[3], top[4] ); pr2.s0 = pr0.s1 = F1( top[3], top[4] ); pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] ); pr2.s1 = pr0.s2 = F1( top[4], top[5] ); pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] ); pr2.s2 = pr0.s3 = F1( top[5], top[6] ); pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] ); pr2.s3 = pr0.s4 = F1( top[6], top[7] ); pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] ); pr2.s4 = pr0.s5 = F1( top[7], top[8] ); pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] ); pr2.s5 = pr0.s6 = F1( top[8], top[9] ); pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] ); pr2.s6 = pr0.s7 = F1( top[9], top[10] ); pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] ); pr2.s7 = F1( top[10], top[11] ); pr3.s7 = F2( top[10], top[11], top[12] ); return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8_hu( const local pixel *src, 
int src_stride, const local pixel *left ) { int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2; pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2; pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2; pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2; pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); // Lower half of pred[] pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2; pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; pr0.s6 = left[7]; pr0.s7 = left[7]; pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2; pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; pr1.s4 = left[7]; pr1.s5 = left[7]; pr1.s6 = left[7]; pr1.s7 = left[7]; pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2; pr2.s2 = left[7]; pr2.s3 = left[7]; pr2.s4 = left[7]; pr2.s5 = left[7]; pr2.s6 = left[7]; pr2.s7 = left[7]; pr3 = (int8)left[7]; return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8c_h( const local pixel *src, int src_stride ) { const local pixel *src_l = src; int8 pr0, pr1, pr2, pr3; // Upper half of pred[] pr0 = (int8)src[-1]; src += src_stride; pr1 = (int8)src[-1]; src += src_stride; pr2 = (int8)src[-1]; src += src_stride; pr3 = (int8)src[-1]; src += src_stride; int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 ); //Lower half of pred[] pr0 = (int8)src[-1]; src += src_stride; pr1 = (int8)src[-1]; src += src_stride; pr2 = (int8)src[-1]; src += src_stride; pr3 = (int8)src[-1]; return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8c_v( const local pixel *src, int src_stride ) { int8 pred = convert_int8( vload8( 0, &src[-src_stride] )); return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred ); } int x264_predict_8x8c_p( const local pixel *src, int src_stride ) { int H = 0, V = 0; for( int i = 0; i < 4; i++ ) { H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]); V += (i + 1) * (src[-1 + (i + 4) 
* src_stride] - src[-1 + (2 - i) * src_stride]); } int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]); int b = (17 * H + 16) >> 5; int c = (17 * V + 16) >> 5; int i00 = a - 3 * b - 3 * c + 16; // Upper half of pred[] int pix = i00; int8 pr0, pr1, pr2, pr3; pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; pix = i00; pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; pix = i00; pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; pix = i00; pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 ); //Lower half of pred[] pix = i00; pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; pix = i00; pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; pix = i00; pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; pix = i00; pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b; pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c; return satd + satd_8x4_intra_lr( src + ( 
src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 ); } int x264_predict_8x8c_dc( const local pixel *src, int src_stride ) { int s0 = 0, s1 = 0, s2 = 0, s3 = 0; for( int i = 0; i < 4; i++ ) { s0 += src[i - src_stride]; s1 += src[i + 4 - src_stride]; s2 += src[-1 + i * src_stride]; s3 += src[-1 + (i+4)*src_stride]; } // Upper half of pred[] int8 dc0; dc0.lo = (int4)( (s0 + s2 + 4) >> 3 ); dc0.hi = (int4)( (s1 + 2) >> 2 ); int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 ); // Lower half of pred[] dc0.lo = (int4)( (s3 + 2) >> 2 ); dc0.hi = (int4)( (s1 + s3 + 4) >> 3 ); return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 ); } #else /* not vectorized: private is cheap registers are scarce */ int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top ) { private pixel pred[32]; // Upper half of pred[] for( int y = 0; y < 4; y++ ) { for( int x = 0; x < 8; x++ ) { pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 ); pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2; } } int satd = satd_8x4_lp( src, src_stride, pred, 8 ); //Lower half of pred[] for( int y = 4; y < 8; y++ ) { for( int x = 0; x < 8; x++ ) { pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 ); pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2; } } pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2; satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return satd; } int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) { private pixel pred[32]; #define PRED( x, y ) pred[(x) + (y)*8] // Upper half of pred[] PRED( 0, 3 ) = F2( left[1], left[2], left[3] ); PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] ); PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top ); PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] ); PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] ); PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] ); PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] ); PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] ); PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] ); PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] ); PRED( 7, 0 ) = F2( top[5], top[6], top[7] ); int satd = satd_8x4_lp( src, src_stride, pred, 8 ); // Lower half of pred[] PRED( 0, 3 ) = F2( left[5], left[6], left[7] ); PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] ); PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] ); PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] ); PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] ); PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] ); PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top ); PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] ); PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] ); PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] ); PRED( 7, 0 ) = F2( top[1], top[2], top[3] ); satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return 
satd; #undef PRED } int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) { private pixel pred[32]; #define PRED( x, y ) pred[(x) + (y)*8] // Upper half of pred[] PRED( 0, 2 ) = F2( left[1], left[0], left_top ); PRED( 0, 3 ) = F2( left[2], left[1], left[0] ); PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] ); PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] ); PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] ); PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] ); PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] ); PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] ); PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] ); PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] ); PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] ); PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] ); PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] ); PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] ); PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] ); PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] ); PRED( 7, 1 ) = F2( top[5], top[6], top[7] ); PRED( 7, 0 ) = F1( top[6], top[7] ); int satd = satd_8x4_lp( src, src_stride, pred, 8 ); //Lower half of pred[] PRED( 0, 2 ) = F2( left[5], left[4], left[3] ); PRED( 0, 3 ) = F2( left[6], left[5], left[4] ); PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] ); PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] ); PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top ); PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] ); PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] ); PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] ); PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] ); PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] ); PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] ); PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] ); PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] ); PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] ); PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] ); PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] ); PRED( 7, 1 ) = F2( top[3], top[4], top[5] ); PRED( 7, 0 ) = F1( top[4], top[5] ); satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return satd; #undef PRED } inline uint32_t pack16to32( uint32_t a, uint32_t b ) { return a + (b << 16); } inline uint32_t pack8to16( uint32_t a, uint32_t b ) { return a + (b << 8); } int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top ) { private pixel pred[32]; int satd; int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) ); int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) ); int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) ); int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) ); int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) ); int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) ); int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) ); int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) ); int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) ); int p10 
= pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) ); int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) ); // Upper half of pred[] vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] ); vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] ); vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] ); vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] ); vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] ); vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] ); vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] ); vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] ); satd = satd_8x4_lp( src, src_stride, pred, 8 ); // Lower half of pred[] vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] ); vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] ); vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] ); vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] ); vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] ); vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] ); vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] ); vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] ); satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return satd; } int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top ) { private pixel pred[32]; int satd; #define PRED( x, y ) pred[(x) + (y)*8] // Upper half of pred[] PRED( 0, 0 ) = F1( top[0], top[1] ); PRED( 0, 1 ) = F2( top[0], top[1], top[2] ); PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] ); PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] ); PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] ); PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] ); PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] ); PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] ); PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] ); PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] ); PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] ); PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] ); PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] ); PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] ); PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] ); PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] ); PRED( 7, 2 ) = F1( top[8], top[9] ); PRED( 7, 3 ) = F2( top[8], top[9], top[10] ); satd = satd_8x4_lp( src, src_stride, pred, 8 ); // Lower half of pred[] PRED( 0, 0 ) = F1( top[2], top[3] ); PRED( 0, 1 ) = F2( top[2], top[3], top[4] ); PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] ); PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] ); PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] ); PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] ); PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] ); PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] ); PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] ); PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] ); PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] ); PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] ); PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] ); PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] ); PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] ); PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] ); PRED( 7, 2 ) = F1( top[10], top[11] ); PRED( 7, 3 ) 
= F2( top[10], top[11], top[12] ); satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return satd; #undef PRED } int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left ) { private pixel pred[32]; int satd; int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) ); int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) ); int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) ); int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) ); int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) ); int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) ); int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) ); int p8 = pack8to16( left[7], left[7] ); // Upper half of pred[] vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] ); vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] ); vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] ); vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] ); vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] ); vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] ); vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] ); vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] ); satd = satd_8x4_lp( src, src_stride, pred, 8 ); // Lower half of pred[] vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] ); vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] ); vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] ); vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] ); vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] ); vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] ); vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] ); vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] ); satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return satd; } int x264_predict_8x8c_h( const local pixel *src, int src_stride ) { private pixel pred[32]; const local pixel *src_l = src; // Upper half of pred[] vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride; vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride; vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride; vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride; int satd = satd_8x4_lp( src_l, src_stride, pred, 8 ); // Lower half of pred[] vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride; vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride; vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride; vstore8( (uchar8)(src[-1]), 3, pred ); return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 ); } int x264_predict_8x8c_v( const local pixel *src, int src_stride ) { private pixel pred[32]; uchar16 v16; v16.lo = vload8( 0, &src[-src_stride] ); v16.hi = vload8( 0, &src[-src_stride] ); vstore16( v16, 0, pred ); vstore16( v16, 1, pred ); return satd_8x4_lp( src, src_stride, pred, 8 ) + satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 ); } int x264_predict_8x8c_p( const local pixel *src, int src_stride ) { int H = 0, V = 0; private pixel pred[32]; 
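    /* Plane prediction: H and V below are weighted sums of pixel differences
     * along the top row and left column, from which the gradients b and c and
     * the DC term a are derived.  The loops then evaluate, incrementally via
     * i00 and pix += b,
     *
     *     pred[x + y*8] = x264_clip_pixel( ( a + b*(x-3) + c*(y-3) + 16 ) >> 5 )
     *
     * which is the same arithmetic as the vectorized copy of this function
     * above. */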
int satd; for( int i = 0; i < 4; i++ ) { H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]); V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]); } int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]); int b = (17 * H + 16) >> 5; int c = (17 * V + 16) >> 5; int i00 = a - 3 * b - 3 * c + 16; // Upper half of pred[] for( int y = 0; y < 4; y++ ) { int pix = i00; for( int x = 0; x < 8; x++ ) { pred[x + y*8] = x264_clip_pixel( pix >> 5 ); pix += b; } i00 += c; } satd = satd_8x4_lp( src, src_stride, pred, 8 ); // Lower half of pred[] for( int y = 0; y < 4; y++ ) { int pix = i00; for( int x = 0; x < 8; x++ ) { pred[x + y*8] = x264_clip_pixel( pix >> 5 ); pix += b; } i00 += c; } satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); return satd; } int x264_predict_8x8c_dc( const local pixel *src, int src_stride ) { private pixel pred[32]; int s0 = 0, s1 = 0, s2 = 0, s3 = 0; for( int i = 0; i < 4; i++ ) { s0 += src[i - src_stride]; s1 += src[i + 4 - src_stride]; s2 += src[-1 + i * src_stride]; s3 += src[-1 + (i+4)*src_stride]; } // Upper half of pred[] uchar8 dc0; dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 ); dc0.hi = (uchar4)( (s1 + 2) >> 2 ); vstore8( dc0, 0, pred ); vstore8( dc0, 1, pred ); vstore8( dc0, 2, pred ); vstore8( dc0, 3, pred ); int satd = satd_8x4_lp( src, src_stride, pred, 8 ); // Lower half of pred[] dc0.lo = (uchar4)( (s3 + 2) >> 2 ); dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 ); vstore8( dc0, 0, pred ); vstore8( dc0, 1, pred ); vstore8( dc0, 2, pred ); vstore8( dc0, 3, pred ); return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 ); } #endif /* Find the least cost intra mode for 32 8x8 macroblocks per workgroup * * Loads 33 macroblocks plus the pixels directly above them into local memory, * padding where necessary with edge pixels. It then cooperatively calculates * smoothed top and left pixels for use in some of the analysis. * * Then groups of 32 threads each calculate a single intra mode for each 8x8 * block. Since consecutive threads are calculating the same intra mode there * is no code-path divergence. 8 intra costs are calculated simultaneously. If * the "slow" argument is not zero, the final two (least likely) intra modes are * tested in a second pass. The slow mode is only enabled for presets slow, * slower, and placebo. * * This allows all of the pixels functions to read pixels from local memory, and * avoids re-fetching edge pixels from global memory. And it allows us to * calculate all of the intra mode costs simultaneously without branch divergence. 
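 *
 * Concretely (see the kernel body): get_local_id( 0 ) selects one of the 32
 * cached macroblocks, get_local_id( 1 ) selects which of the 8 first-pass
 * prediction modes that thread evaluates, and every thread folds its SATD into
 * cost_buf[lx] with atom_min() so only the cheapest mode survives.  The ly == 0
 * thread (for macroblocks inside the frame) then writes cost_buf[lx] + 5*lambda
 * to fenc_intra_cost.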
* * Local dimension: [ 32, 8 ] * Global dimensions: [ paddedWidth, height ] */ kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc, global uint16_t *fenc_intra_cost, global int *frame_stats, int lambda, int mb_width, int slow ) { #define CACHE_STRIDE 265 #define BLOCK_OFFSET 266 local pixel cache[2385]; local int cost_buf[32]; local pixel top[32 * 16]; local pixel left[32 * 8]; local pixel left_top[32]; int lx = get_local_id( 0 ); int ly = get_local_id( 1 ); int gx = get_global_id( 0 ); int gy = get_global_id( 1 ); int gidx = get_group_id( 0 ); int gidy = get_group_id( 1 ); int linear_id = ly * get_local_size( 0 ) + lx; int satd = COST_MAX; int basex = gidx << 8; int basey = (gidy << 3) - 1; /* Load 33 8x8 macroblocks and the pixels above them into local cache */ for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ ) { int x = linear_id << 2; uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) ); cache[y * CACHE_STRIDE + 1 + x] = data.s0; cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1; cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2; cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3; } /* load pixels on left edge */ if( linear_id < 9 ) cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0; barrier( CLK_LOCAL_MEM_FENCE ); // Cooperatively build the top edge for the macroblock using lowpass filter int j = ly; top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2; j += 8; top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] + 2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2; // Cooperatively build the left edge for the macroblock using lowpass filter left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] + 2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] + cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2; // One left_top per macroblock if( 0 == ly ) { left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] + cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2; cost_buf[lx] = COST_MAX; } barrier( CLK_LOCAL_MEM_FENCE ); // each warp/wavefront generates a different prediction type; no divergence switch( ly ) { case 0: satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); break; case 1: satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); break; case 2: satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); break; case 3: satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE ); break; case 4: satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); break; case 5: satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); break; case 6: satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] ); break; case 7: satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] ); break; default: break; } atom_min( &cost_buf[lx], satd ); if( slow ) { // Do the remaining two (least likely) prediction modes switch( ly ) { case 0: // DDL satd = x264_predict_8x8_ddl( 
&cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] ); atom_min( &cost_buf[lx], satd ); break; case 1: // VL satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] ); atom_min( &cost_buf[lx], satd ); break; default: break; } } barrier( CLK_LOCAL_MEM_FENCE ); if( (0 == ly) && (gx < mb_width) ) fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda; // initialize the frame_stats[2] buffer for kernel sum_intra_cost(). if( gx < 2 && gy == 0 ) frame_stats[gx] = 0; #undef CACHE_STRIDE #undef BLOCK_OFFSET } /* * parallel sum intra costs * * global launch dimensions: [256, mb_height] */ kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost, const global uint16_t *inv_qscale_factor, global int *fenc_row_satds, global int *frame_stats, int mb_width ) { int y = get_global_id( 1 ); int mb_height = get_global_size( 1 ); int row_satds = 0; int cost_est = 0; int cost_est_aq = 0; for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 )) { int mb_xy = x + y * mb_width; int cost = fenc_intra_cost[mb_xy]; int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8; int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2; row_satds += cost_aq; if( b_frame_score_mb ) { cost_est += cost; cost_est_aq += cost_aq; } } local int buffer[256]; int x = get_global_id( 0 ); row_satds = parallel_sum( row_satds, x, buffer ); cost_est = parallel_sum( cost_est, x, buffer ); cost_est_aq = parallel_sum( cost_est_aq, x, buffer ); if( get_global_id( 0 ) == 0 ) { fenc_row_satds[y] = row_satds; atomic_add( frame_stats + COST_EST, cost_est ); atomic_add( frame_stats + COST_EST_AQ, cost_est_aq ); } } x264-master/common/opencl/motionsearch.cl000066400000000000000000000215041502133446700206440ustar00rootroot00000000000000/* Hierarchical (iterative) OpenCL lowres motion search */ inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height ) { /* edge macroblocks might not have a direct descendant, use nearest */ x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 ); y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 ); return (mb_width>>1) * y + x; } /* Four threads calculate an 8x8 SAD. Each does two rows */ int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs ) { frefpos.y += idx << 1; fencpos.y += idx << 1; int cost = 0; if( frefpos.x < 0 ) { /* slow path when MV goes past left edge. 
The GPU clamps reads from * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really * want are [0, 0, 1, 2] */ for( int y = 0; y < 2; y++ ) { for( int x = 0; x < 8; x++ ) { pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; cost += abs_diff( enc, ref ); } } } else { uint4 enc, ref, costs = 0; enc = read_imageui( fenc, sampler, fencpos ); ref = read_imageui( fref, sampler, frefpos ); costs += abs_diff( enc, ref ); enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) ); ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) ); costs += abs_diff( enc, ref ); enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) ); ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) ); costs += abs_diff( enc, ref ); enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) ); ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) ); costs += abs_diff( enc, ref ); cost = costs.s0 + costs.s1 + costs.s2 + costs.s3; } costs[idx] = cost; return costs[0] + costs[1] + costs[2] + costs[3]; } /* One thread performs 8x8 SAD */ int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos ) { if( frefpos.x < 0 ) { /* slow path when MV goes past left edge */ int cost = 0; for( int y = 0; y < 8; y++ ) { for( int x = 0; x < 8; x++ ) { uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0; uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0; cost += abs_diff( enc, ref ); } } return cost; } else { uint4 enc, ref, cost = 0; for( int y = 0; y < 8; y++ ) { for( int x = 0; x < 8; x += 4 ) { enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ); ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ); cost += abs_diff( enc, ref ); } } return cost.s0 + cost.s1 + cost.s2 + cost.s3; } } /* * hierarchical motion estimation * * Each kernel launch is a single iteration * * MB per work group is determined by lclx / 4 * lcly * * global launch dimensions: [mb_width * 4, mb_height] */ kernel void hierarchical_motion( read_only image2d_t fenc, read_only image2d_t fref, const global short2 *in_mvs, global short2 *out_mvs, global int16_t *out_mv_costs, global short2 *mvp_buffer, local int16_t *cost_local, local short2 *mvc_local, int mb_width, int lambda, int me_range, int scale, int b_shift_index, int b_first_iteration, int b_reverse_references ) { int mb_x = get_global_id( 0 ) >> 2; if( mb_x >= mb_width ) return; int mb_height = get_global_size( 1 ); int mb_i = get_global_id( 0 ) & 3; int mb_y = get_global_id( 1 ); int mb_xy = mb_y * mb_width + mb_x; const int mb_size = 8; int2 coord = (int2)(mb_x, mb_y) * mb_size; const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); cost_local += 4 * mb_in_group; int i_mvc = 0; mvc_local += 4 * mb_in_group; mvc_local[mb_i] = 0; int2 mvp =0; if( !b_first_iteration ) { #define MVC( DX, DY )\ {\ int px = mb_x + DX;\ int py = mb_y + DY;\ mvc_local[i_mvc] = b_shift_index ? 
in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \ in_mvs[mb_width * py + px];\ mvc_local[i_mvc] >>= (short) scale;\ i_mvc++;\ } /* Find MVP from median of MVCs */ if( b_reverse_references ) { /* odd iterations: derive MVP from down and right */ if( mb_x < mb_width - 1 ) MVC( 1, 0 ); if( mb_y < mb_height - 1 ) { MVC( 0, 1 ); if( mb_x > b_shift_index ) MVC( -1, 1 ); if( mb_x < mb_width - 1 ) MVC( 1, 1 ); } } else { /* even iterations: derive MVP from up and left */ if( mb_x > 0 ) MVC( -1, 0 ); if( mb_y > 0 ) { MVC( 0, -1 ); if( mb_x < mb_width - 1 ) MVC( 1, -1 ); if( mb_x > b_shift_index ) MVC( -1, -1 ); } } #undef MVC mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); } /* current mvp matches the previous mvp and we have not changed scale. We know * we're going to arrive at the same MV again, so just copy the previous * result to our output. */ if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y ) { out_mvs[mb_xy] = in_mvs[mb_xy]; return; } mvp_buffer[mb_xy] = convert_short2_sat(mvp); int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4; int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4; int2 bestmv = clamp(mvp, mv_min, mv_max); int2 refcrd = coord + bestmv; /* measure cost at bestmv */ int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) + lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) ); do { /* measure costs at offsets from bestmv */ refcrd = coord + bestmv + dia_offs[mb_i]; int2 trymv = bestmv + dia_offs[mb_i]; int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) + lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) ); cost_local[mb_i] = (cost<<2) | mb_i; cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) ); if( (cost >> 2) >= bcost ) break; bestmv += dia_offs[cost&3]; bcost = cost>>2; if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y ) break; } while( --me_range > 0 ); int2 trymv = 0, diff = 0; #define COST_MV_NO_PAD( L )\ trymv = clamp( trymv, mv_min, mv_max );\ diff = convert_int2_sat(abs_diff( mvp, trymv ));\ if( diff.x > 1 || diff.y > 1 ) {\ int2 refcrd = coord + trymv;\ int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\ L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\ if( cost < bcost ) { bcost = cost; bestmv = trymv; } } COST_MV_NO_PAD( 0 ); if( !b_first_iteration ) { /* try cost at previous iteration's MV, if MVP was too far away */ int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]); prevmv >>= scale; trymv = prevmv; COST_MV_NO_PAD( lambda ); } for( int i = 0; i < i_mvc; i++ ) { /* try cost at each candidate MV, if MVP was too far away */ trymv = convert_int2_sat( mvc_local[i] ); COST_MV_NO_PAD( lambda ); } if( mb_i == 0 ) { bestmv <<= scale; out_mvs[mb_xy] = convert_short2_sat(bestmv); out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ); } } x264-master/common/opencl/subpel.cl000066400000000000000000000217501502133446700174460ustar00rootroot00000000000000/* OpenCL lowres subpel Refine */ /* Each thread performs 8x8 SAD. 
4 threads per MB, so the 4 DIA HPEL offsets are * calculated simultaneously */ int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) { int2 frefpos = qpos >> 2; int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2); uint mask_shift = 8 * hpel_idx; uint4 cost4 = 0; for( int y = 0; y < 8; y++ ) { uint4 enc, val4; enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y)); val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF; val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF; val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF; val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF; cost4 += abs_diff( enc, val4 ); enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y)); val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF; val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF; val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF; val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF; cost4 += abs_diff( enc, val4 ); } return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3; } /* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */ int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos ) { int2 frefApos = qpos >> 2; int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2); int2 qposB = qpos + ((qpos & 1) << 1); int2 frefBpos = qposB >> 2; int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2); uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; int cost = 0; for( int y = 0; y < 8; y++ ) { for( int x = 0; x < 8; x++ ) { uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0; uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF; uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF; cost += abs_diff( enc, rhadd( vA, vB ) ); } } return cost; } /* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane * * Each thread collects 1/4 of the rows of diffs and processes one quarter of * the transforms */ int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos, local sum2_t *tmpp, int idx ) { volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp; sum2_t b0, b1, b2, b3; // fencpos is full-pel position of original MB // qpos is qpel position within reference frame int2 frefApos = qpos >> 2; int hpelA = ((qpos.x&2)>>1) + (qpos.y&2); int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1)); int2 frefBpos = qposB >> 2; int hpelB = ((qposB.x&2)>>1) + (qposB.y&2); uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB; uint vA, vB; uint a0, a1; uint enc; sum2_t sum = 0; #define READ_DIFF( OUT, X )\ enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\ vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\ vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\ OUT = enc - rhadd( vA, vB ); #define READ_DIFF_EX( OUT, a, b )\ {\ READ_DIFF( a0, a );\ READ_DIFF( a1, b );\ OUT = a0 + (a1<>BITS_PER_SUM)) >> 1; } constant int2 hpoffs[4] = { {0, -2}, {-2, 0}, {2, 0}, {0, 2} }; /* sub pixel refinement 
of motion vectors, output MVs and costs are moved from * temporary buffers into final per-frame buffer * * global launch dimensions: [mb_width * 4, mb_height] * * With X being the source 16x16 pixels, F is the lowres pixel used by the * motion search. We will now utilize the H V and C pixels (stored in separate * planes) to search at half-pel increments. * * X X X X X X * F H F H F * X X X X X X * V C V C V * X X X X X X * F H F H F * X X X X X X * * The YX HPEL bits of the motion vector selects the plane we search in. The * four planes are packed in the fref_planes 2D image buffer. Each sample * returns: s0 = F, s1 = H, s2 = V, s3 = C */ kernel void subpel_refine( read_only image2d_t fenc, read_only image2d_t fref_planes, const global short2 *in_mvs, const global int16_t *in_sad_mv_costs, local int16_t *cost_local, local sum2_t *satd_local, local short2 *mvc_local, global short2 *fenc_lowres_mv, global int16_t *fenc_lowres_mv_costs, int mb_width, int lambda, int b, int ref, int b_islist1 ) { int mb_x = get_global_id( 0 ) >> 2; if( mb_x >= mb_width ) return; int mb_height = get_global_size( 1 ); int mb_i = get_global_id( 0 ) & 3; int mb_y = get_global_id( 1 ); int mb_xy = mb_y * mb_width + mb_x; /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that * hold many frames worth of motion vectors. We must offset into the correct * location for this frame's vectors. The kernel will be passed the correct * directional buffer for the direction of the search: list1 or list0 * * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1] * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */ fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height; /* Adjust pointers into local memory buffers for this thread's data */ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2); cost_local += mb_in_group * 4; satd_local += mb_in_group * 16; mvc_local += mb_in_group * 4; int i_mvc = 0; mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0; #define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)]; if( mb_x > 0 ) MVC( -1, 0 ); if( mb_y > 0 ) { MVC( 0, -1 ); if( mb_x < mb_width - 1 ) MVC( 1, -1 ); if( mb_x > 0 ) MVC( -1, -1 ); } #undef MVC int2 mvp = (i_mvc <= 1) ? 
convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] ); int bcost = in_sad_mv_costs[mb_xy]; int2 coord = (int2)(mb_x, mb_y) << 3; int2 bmv = convert_int2_sat( in_mvs[mb_xy] ); /* Make mvp and bmv QPEL MV */ mvp <<= 2; bmv <<= 2; #define HPEL_QPEL( ARR, FUNC )\ {\ int2 trymv = bmv + ARR[mb_i];\ int2 qpos = (coord << 2) + trymv;\ int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\ cost_local[mb_i] = (cost<<2) + mb_i;\ cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\ if( (cost>>2) < bcost )\ {\ bmv += ARR[cost&3];\ bcost = cost>>2;\ }\ } HPEL_QPEL( hpoffs, sad_8x8_ii_hpel ); HPEL_QPEL( dia_offs, sad_8x8_ii_qpel ); fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv ); /* remeasure cost of bmv using SATD */ int2 qpos = (coord << 2) + bmv; cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i ); bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3]; bcost += lambda * mv_cost( abs_diff( bmv, mvp ) ); fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ); } x264-master/common/opencl/weightp.cl000066400000000000000000000032651502133446700176240ustar00rootroot00000000000000/* Weightp filter a downscaled image into a temporary output buffer. * This kernel is launched once for each scale. * * Launch dimensions: width x height (in pixels) */ kernel void weightp_scaled_images( read_only image2d_t in_plane, write_only image2d_t out_plane, uint offset, uint scale, uint denom ) { int gx = get_global_id( 0 ); int gy = get_global_id( 1 ); uint4 input_val; uint4 output_val; input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)); output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) ); write_imageui( out_plane, (int2)(gx, gy), output_val ); } /* Weightp filter for the half-pel interpolated image * * Launch dimensions: width x height (in pixels) */ kernel void weightp_hpel( read_only image2d_t in_plane, write_only image2d_t out_plane, uint offset, uint scale, uint denom ) { int gx = get_global_id( 0 ); int gy = get_global_id( 1 ); uint input_val; uint output_val; input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0; //Unpack uint4 temp; temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff; temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff; temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) ); //Pack output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24); write_imageui( out_plane, (int2)(gx, gy), output_val ); } x264-master/common/opencl/x264-cl.h000066400000000000000000000066271502133446700171120ustar00rootroot00000000000000#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; /* 7.18.1.1 Exact-width integer types */ typedef signed char int8_t; typedef unsigned char uint8_t; typedef short int16_t; typedef unsigned short uint16_t; typedef int int32_t; typedef unsigned uint32_t; typedef uint8_t pixel; typedef uint16_t sum_t; typedef uint32_t sum2_t; #define LOWRES_COST_MASK ((1<<14)-1) #define LOWRES_COST_SHIFT 14 #define COST_MAX (1<<28) #define PIXEL_MAX 255 #define BITS_PER_SUM (8 * sizeof(sum_t)) /* Constants for offsets into frame statistics buffer */ #define COST_EST 0 #define COST_EST_AQ 1 #define INTRA_MBS 2 #define COPY2_IF_LT( x, y, a, b )\ if( (y) < (x) )\ {\ (x) = (y);\ (a) = 
(b);\ } constant int2 dia_offs[4] = { {0, -1}, {-1, 0}, {1, 0}, {0, 1}, }; inline pixel x264_clip_pixel( int x ) { return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX ); } inline int2 x264_median_mv( short2 a, short2 b, short2 c ) { short2 t1 = min(a, b); short2 t2 = min(max(a, b), c); return convert_int2(max(t1, t2)); } inline sum2_t abs2( sum2_t a ) { sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1); return (a + s) ^ s; } #define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ sum2_t t0 = s0 + s1;\ sum2_t t1 = s0 - s1;\ sum2_t t2 = s2 + s3;\ sum2_t t3 = s2 - s3;\ d0 = t0 + t2;\ d2 = t0 - t2;\ d1 = t1 + t3;\ d3 = t1 - t3;\ } #define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\ int2 t0 = s0 + s1;\ int2 t1 = s0 - s1;\ int2 t2 = s2 + s3;\ int2 t3 = s2 - s3;\ d0 = t0 + t2;\ d2 = t0 - t2;\ d1 = t1 + t3;\ d3 = t1 - t3;\ } #define SATD_C_8x4_Q( name, q1, q2 )\ int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\ {\ sum2_t tmp[4][4];\ sum2_t a0, a1, a2, a3;\ sum2_t sum = 0;\ for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\ {\ a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\ a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\ a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\ a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\ HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\ }\ for( int i = 0; i < 4; i++ )\ {\ HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\ sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\ }\ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\ } /* * Utility function to perform a parallel sum reduction of an array of integers */ int parallel_sum( int value, int x, volatile local int *array ) { array[x] = value; barrier( CLK_LOCAL_MEM_FENCE ); int dim = get_local_size( 0 ); while( dim > 1 ) { dim >>= 1; if( x < dim ) array[x] += array[x + dim]; if( dim > 32 ) barrier( CLK_LOCAL_MEM_FENCE ); } return array[0]; } int mv_cost( uint2 mvd ) { float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f; float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) ); return (int) (cost.x + cost.y); } x264-master/common/osdep.c000066400000000000000000000062611502133446700156320ustar00rootroot00000000000000/***************************************************************************** * osdep.c: platform-specific code ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Steven Walters * Laurent Aimar * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "osdep.h" #if SYS_WINDOWS #include #include #else #include #endif #include #if PTW32_STATIC_LIB /* this is a global in pthread-win32 to indicate if it has been initialized or not */ extern int ptw32_processInitialized; #endif int64_t x264_mdate( void ) { #if SYS_WINDOWS struct timeb tb; ftime( &tb ); return ((int64_t)tb.time * 1000 + (int64_t)tb.millitm) * 1000; #elif HAVE_CLOCK_GETTIME struct timespec ts; clock_gettime( CLOCK_MONOTONIC, &ts ); return (int64_t)ts.tv_sec * 1000000 + (int64_t)ts.tv_nsec / 1000; #else struct timeval tv_date; gettimeofday( &tv_date, NULL ); return (int64_t)tv_date.tv_sec * 1000000 + (int64_t)tv_date.tv_usec; #endif } #if HAVE_WIN32THREAD || PTW32_STATIC_LIB /* state of the threading library being initialized */ static volatile LONG threading_is_init = 0; static void threading_destroy( void ) { #if PTW32_STATIC_LIB pthread_win32_thread_detach_np(); pthread_win32_process_detach_np(); #else x264_win32_threading_destroy(); #endif } static int threading_init( void ) { #if PTW32_STATIC_LIB /* if static pthread-win32 is already initialized, then do nothing */ if( ptw32_processInitialized ) return 0; if( !pthread_win32_process_attach_np() ) return -1; #else if( x264_win32_threading_init() ) return -1; #endif /* register cleanup to run at process termination */ atexit( threading_destroy ); return 0; } int x264_threading_init( void ) { LONG state; while( (state = InterlockedCompareExchange( &threading_is_init, -1, 0 )) != 0 ) { /* if already init, then do nothing */ if( state > 0 ) return 0; } if( threading_init() < 0 ) { InterlockedExchange( &threading_is_init, 0 ); return -1; } InterlockedExchange( &threading_is_init, 1 ); return 0; } #endif x264-master/common/osdep.h000066400000000000000000000411551502133446700156400ustar00rootroot00000000000000/***************************************************************************** * osdep.h: platform-specific code ***************************************************************************** * Copyright (C) 2007-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_OSDEP_H #define X264_OSDEP_H #define _LARGEFILE_SOURCE 1 #define _FILE_OFFSET_BITS 64 #include #include #include #include #include #include "config.h" #ifdef __INTEL_COMPILER #include #else #include #endif #ifdef _WIN32 #include #include #endif #include "x264.h" #if !HAVE_LOG2F #define log2f(x) (logf(x)/0.693147180559945f) #define log2(x) (log(x)/0.693147180559945) #endif #ifdef _MSC_VER #define inline __inline #define strcasecmp _stricmp #define strncasecmp _strnicmp #define strtok_r strtok_s #define S_ISREG(x) (((x) & S_IFMT) == S_IFREG) #else #include #endif #if !defined(va_copy) && defined(__INTEL_COMPILER) #define va_copy(dst, src) ((dst) = (src)) #endif #if !defined(isfinite) && (SYS_OPENBSD || SYS_SunOS) #define isfinite finite #endif #if !HAVE_STRTOK_R && !defined(strtok_r) #define strtok_r(str,delim,save) strtok(str,delim) #endif #if defined(_MSC_VER) && _MSC_VER < 1900 /* MSVC pre-VS2015 has broken snprintf/vsnprintf implementations which are incompatible with C99. */ static inline int x264_vsnprintf( char *s, size_t n, const char *fmt, va_list arg ) { int length = -1; if( n ) { va_list arg2; va_copy( arg2, arg ); length = _vsnprintf( s, n, fmt, arg2 ); va_end( arg2 ); /* _(v)snprintf adds a null-terminator only if the length is less than the buffer size. */ if( length < 0 || length >= n ) s[n-1] = '\0'; } /* _(v)snprintf returns a negative number if the length is greater than the buffer size. */ if( length < 0 ) return _vscprintf( fmt, arg ); return length; } static inline int x264_snprintf( char *s, size_t n, const char *fmt, ... ) { va_list arg; va_start( arg, fmt ); int length = x264_vsnprintf( s, n, fmt, arg ); va_end( arg ); return length; } #define snprintf x264_snprintf #define vsnprintf x264_vsnprintf #endif #ifdef _WIN32 /* Functions for dealing with Unicode on Windows. */ static inline wchar_t *x264_utf8_to_utf16( const char *utf8 ) { int len = MultiByteToWideChar( CP_UTF8, MB_ERR_INVALID_CHARS, utf8, -1, NULL, 0 ); if( len ) { wchar_t *utf16 = malloc( len * sizeof( wchar_t ) ); if( utf16 ) { if( MultiByteToWideChar( CP_UTF8, MB_ERR_INVALID_CHARS, utf8, -1, utf16, len ) ) return utf16; free( utf16 ); } } return NULL; } static inline wchar_t *x264_utf8_to_utf16_try_buf( const char *utf8, wchar_t *buf_utf16, int buf_len ) { if( MultiByteToWideChar( CP_UTF8, MB_ERR_INVALID_CHARS, utf8, -1, buf_utf16, buf_len ) ) return buf_utf16; return x264_utf8_to_utf16( utf8 ); } #define x264_fopen( filename, mode ) x264_fopen_internal( filename, L##mode ) static inline FILE *x264_fopen_internal( const char *filename, const wchar_t *mode_utf16 ) { FILE *f = NULL; wchar_t filename_buf[MAX_PATH]; wchar_t *filename_utf16 = x264_utf8_to_utf16_try_buf( filename, filename_buf, MAX_PATH ); if( filename_utf16 ) { f = _wfopen( filename_utf16, mode_utf16 ); if( filename_utf16 != filename_buf ) free( filename_utf16 ); } return f; } static inline int x264_rename( const char *oldname, const char *newname ) { int ret = -1; wchar_t oldname_buf[MAX_PATH]; wchar_t *oldname_utf16 = x264_utf8_to_utf16_try_buf( oldname, oldname_buf, MAX_PATH ); if( oldname_utf16 ) { wchar_t newname_buf[MAX_PATH]; wchar_t *newname_utf16 = x264_utf8_to_utf16_try_buf( newname, newname_buf, MAX_PATH ); if( newname_utf16 ) { /* POSIX says that rename() removes the destination, but Win32 doesn't. 
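 * _wrename() fails when the destination already exists, so the destination is
 * unlinked first; as a side effect the replace is not atomic on Windows.
 * (Typical use is temp-file finalization, e.g. renaming a ".temp" stats file
 * over its final name in 2-pass mode.)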
*/ _wunlink( newname_utf16 ); ret = _wrename( oldname_utf16, newname_utf16 ); if( newname_utf16 != newname_buf ) free( newname_utf16 ); } if( oldname_utf16 != oldname_buf ) free( oldname_utf16 ); } return ret; } #define x264_struct_stat struct _stati64 #define x264_fstat _fstati64 static inline int x264_stat( const char *path, x264_struct_stat *buf ) { int ret = -1; wchar_t path_buf[MAX_PATH]; wchar_t *path_utf16 = x264_utf8_to_utf16_try_buf( path, path_buf, MAX_PATH ); if( path_utf16 ) { ret = _wstati64( path_utf16, buf ); if( path_utf16 != path_buf ) free( path_utf16 ); } return ret; } #else #define x264_fopen fopen #define x264_rename rename #define x264_struct_stat struct stat #define x264_fstat fstat #define x264_stat stat #endif /* mdate: return the current date in microsecond */ X264_API int64_t x264_mdate( void ); #if defined(_WIN32) && !HAVE_WINRT static inline int x264_vfprintf( FILE *stream, const char *format, va_list arg ) { HANDLE console = NULL; DWORD mode; if( stream == stdout ) console = GetStdHandle( STD_OUTPUT_HANDLE ); else if( stream == stderr ) console = GetStdHandle( STD_ERROR_HANDLE ); /* Only attempt to convert to UTF-16 when writing to a non-redirected console screen buffer. */ if( GetConsoleMode( console, &mode ) ) { char buf[4096]; wchar_t buf_utf16[4096]; va_list arg2; va_copy( arg2, arg ); int length = vsnprintf( buf, sizeof(buf), format, arg2 ); va_end( arg2 ); if( length > 0 && (unsigned)length < sizeof(buf) ) { /* WriteConsoleW is the most reliable way to output Unicode to a console. */ int length_utf16 = MultiByteToWideChar( CP_UTF8, 0, buf, length, buf_utf16, sizeof(buf_utf16)/sizeof(wchar_t) ); DWORD written; WriteConsoleW( console, buf_utf16, length_utf16, &written, NULL ); return length; } } return vfprintf( stream, format, arg ); } static inline int x264_is_regular_file_path( const char *path ) { int ret = -1; wchar_t path_buf[MAX_PATH]; wchar_t *path_utf16 = x264_utf8_to_utf16_try_buf( path, path_buf, MAX_PATH ); if( path_utf16 ) { x264_struct_stat buf; ret = !(WaitNamedPipeW( path_utf16, 0 ) || GetLastError() == ERROR_SEM_TIMEOUT); if( ret && !_wstati64( path_utf16, &buf ) ) ret = S_ISREG( buf.st_mode ); if( path_utf16 != path_buf ) free( path_utf16 ); } return ret; } #else #define x264_vfprintf vfprintf static inline int x264_is_regular_file_path( const char *filename ) { x264_struct_stat file_stat; if( x264_stat( filename, &file_stat ) ) return 1; return S_ISREG( file_stat.st_mode ); } #endif static inline int x264_is_regular_file( FILE *filehandle ) { x264_struct_stat file_stat; if( x264_fstat( fileno( filehandle ), &file_stat ) ) return 1; return S_ISREG( file_stat.st_mode ); } #define x264_glue3_expand(x,y,z) x##_##y##_##z #define x264_glue3(x,y,z) x264_glue3_expand(x,y,z) #ifdef _MSC_VER #define DECLARE_ALIGNED( var, n ) __declspec(align(n)) var #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #endif #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 ) #define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) // ARM compilers don't reliably align stack variables // - EABI requires only 8 byte stack alignment to be maintained // - gcc can't align stack variables to more even if the stack were to be correctly aligned outside the function // - armcc can't either, but is nice enough to actually tell you so // - Apple gcc only maintains 4 byte alignment // - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils... 
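/* The emulation below sidesteps this by over-allocating a byte buffer and
 * rounding the pointer up to the requested alignment. As a sketch, a
 * hypothetical instantiation
 *     ALIGNED_ARRAY_EMU( 15, int16_t, buf, [4], [16] )
 * expands to roughly
 *     uint8_t buf_u[sizeof(int16_t [4][16]) + 15];
 *     int16_t (*buf)[16] = (void*)((intptr_t)(buf_u + 15) & ~15);
 * so buf[i][j] indexes 16-byte-aligned storage just as "int16_t buf[4][16]"
 * would, at the cost of up to 15 wasted bytes of stack. */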
#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\ uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \ type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask) #if ARCH_ARM && SYS_MACOSX #define ALIGNED_ARRAY_8( ... ) EXPAND( ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) ) #else #define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ ) #endif #if ARCH_ARM #define ALIGNED_ARRAY_16( ... ) EXPAND( ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) ) #else #define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ ) #endif #define EXPAND(x) x #if ARCH_X86 || ARCH_X86_64 || ARCH_LOONGARCH #define NATIVE_ALIGN 64 #define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) #define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 ) #if STACK_ALIGNMENT >= 32 #define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ ) #else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) #endif #if STACK_ALIGNMENT >= 64 #define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ ) #else #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) #endif #else #define NATIVE_ALIGN 16 #define ALIGNED_32 ALIGNED_16 #define ALIGNED_64 ALIGNED_16 #define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16 #define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16 #endif #if STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4) #define REALIGN_STACK __attribute__((force_align_arg_pointer)) #else #define REALIGN_STACK #endif #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) #define UNUSED __attribute__((unused)) #define ALWAYS_INLINE __attribute__((always_inline)) inline #define NOINLINE __attribute__((noinline)) #define MAY_ALIAS __attribute__((may_alias)) #define x264_constant_p(x) __builtin_constant_p(x) #define x264_nonconstant_p(x) (!__builtin_constant_p(x)) #else #ifdef _MSC_VER #define ALWAYS_INLINE __forceinline #define NOINLINE __declspec(noinline) #else #define ALWAYS_INLINE inline #define NOINLINE #endif #define UNUSED #define MAY_ALIAS #define x264_constant_p(x) 0 #define x264_nonconstant_p(x) 0 #endif /* threads */ #if HAVE_BEOSTHREAD #include #define x264_pthread_t thread_id static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(void *), void *d ) { *t = spawn_thread( f, "", 10, d ); if( *t < B_NO_ERROR ) return -1; resume_thread( *t ); return 0; } #define x264_pthread_join(t,s) { long tmp; \ wait_for_thread(t,(s)?(long*)(s):&tmp); } #elif HAVE_POSIXTHREAD #include #define x264_pthread_t pthread_t #define x264_pthread_create pthread_create #define x264_pthread_join pthread_join #define x264_pthread_mutex_t pthread_mutex_t #define x264_pthread_mutex_init pthread_mutex_init #define x264_pthread_mutex_destroy pthread_mutex_destroy #define x264_pthread_mutex_lock pthread_mutex_lock #define x264_pthread_mutex_unlock pthread_mutex_unlock #define x264_pthread_cond_t pthread_cond_t #define x264_pthread_cond_init pthread_cond_init #define x264_pthread_cond_destroy pthread_cond_destroy #define x264_pthread_cond_broadcast pthread_cond_broadcast #define x264_pthread_cond_wait pthread_cond_wait #define x264_pthread_attr_t pthread_attr_t #define x264_pthread_attr_init pthread_attr_init #define x264_pthread_attr_destroy pthread_attr_destroy #define x264_pthread_num_processors_np pthread_num_processors_np #define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #elif HAVE_WIN32THREAD #include "win32thread.h" #else #define x264_pthread_t 
int #define x264_pthread_create(t,u,f,d) 0 #define x264_pthread_join(t,s) #endif //HAVE_*THREAD #if !HAVE_POSIXTHREAD && !HAVE_WIN32THREAD #define x264_pthread_mutex_t int #define x264_pthread_mutex_init(m,f) 0 #define x264_pthread_mutex_destroy(m) #define x264_pthread_mutex_lock(m) #define x264_pthread_mutex_unlock(m) #define x264_pthread_cond_t int #define x264_pthread_cond_init(c,f) 0 #define x264_pthread_cond_destroy(c) #define x264_pthread_cond_broadcast(c) #define x264_pthread_cond_wait(c,m) #define x264_pthread_attr_t int #define x264_pthread_attr_init(a) 0 #define x264_pthread_attr_destroy(a) #define X264_PTHREAD_MUTEX_INITIALIZER 0 #endif #if HAVE_WIN32THREAD || PTW32_STATIC_LIB X264_API int x264_threading_init( void ); #else #define x264_threading_init() 0 #endif static ALWAYS_INLINE int x264_pthread_fetch_and_add( int *val, int add, x264_pthread_mutex_t *mutex ) { #if HAVE_THREAD #if defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ > 0) && HAVE_SYNC_FETCH_AND_ADD return __sync_fetch_and_add( val, add ); #else x264_pthread_mutex_lock( mutex ); int res = *val; *val += add; x264_pthread_mutex_unlock( mutex ); return res; #endif #else int res = *val; *val += add; return res; #endif } #define WORD_SIZE sizeof(void*) #define asm __asm__ #if WORDS_BIGENDIAN #define endian_fix(x) (x) #define endian_fix64(x) (x) #define endian_fix32(x) (x) #define endian_fix16(x) (x) #else #if HAVE_X86_INLINE_ASM && HAVE_MMX static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x ) { asm("bswap %0":"+r"(x)); return x; } #elif defined(__GNUC__) && HAVE_ARMV6 static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x ) { asm("rev %0, %0":"+r"(x)); return x; } #else static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x ) { return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24); } #endif #if HAVE_X86_INLINE_ASM && ARCH_X86_64 static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x ) { asm("bswap %0":"+r"(x)); return x; } #else static ALWAYS_INLINE uint64_t endian_fix64( uint64_t x ) { return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32); } #endif static ALWAYS_INLINE uintptr_t endian_fix( uintptr_t x ) { return WORD_SIZE == 8 ? endian_fix64(x) : endian_fix32(x); } static ALWAYS_INLINE uint16_t endian_fix16( uint16_t x ) { return (uint16_t)((x<<8)|(x>>8)); } #endif /* For values with 4 bits or less. */ static ALWAYS_INLINE int x264_ctz_4bit( uint32_t x ) { static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; return lut[x]; } #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 3) #define x264_clz(x) __builtin_clz(x) #define x264_ctz(x) __builtin_ctz(x) #else static ALWAYS_INLINE int x264_clz( uint32_t x ) { static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0}; int y, z = (((x >> 16) - 1) >> 27) & 16; x >>= z^16; z += y = ((x - 0x100) >> 28) & 8; x >>= y^8; z += y = ((x - 0x10) >> 29) & 4; x >>= y^4; return z + lut[x]; } static ALWAYS_INLINE int x264_ctz( uint32_t x ) { static uint8_t lut[16] = {4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; int y, z = (((x & 0xffff) - 1) >> 27) & 16; x >>= z; z += y = (((x & 0xff) - 1) >> 28) & 8; x >>= y; z += y = (((x & 0xf) - 1) >> 29) & 4; x >>= y; return z + lut[x&0xf]; } #endif #if HAVE_X86_INLINE_ASM && HAVE_MMX /* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of * using complex address modes properly unless we use inline asm. 
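 * The "m" constraint on the (always-inlined) wrapper below lets the compiler
 * fold a complex address such as base + x + y*stride directly into the
 * prefetcht0 memory operand. A hypothetical call site would look like
 *     x264_prefetch( &src[x + i_stride] );
 * prefetching the next row of a plane shortly before it is read.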
*/ static ALWAYS_INLINE void x264_prefetch( void *p ) { asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) ); } /* We require that prefetch not fault on invalid reads, so we only enable it on * known architectures. */ #elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\ (ARCH_X86 || ARCH_X86_64 || ARCH_ARM || ARCH_PPC) #define x264_prefetch(x) __builtin_prefetch(x) #else #define x264_prefetch(x) #endif #endif /* X264_OSDEP_H */ x264-master/common/pixel.c000066400000000000000000001751011502133446700156410ustar00rootroot00000000000000/***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" #if HAVE_MMX # include "x86/pixel.h" # include "x86/predict.h" #endif #if HAVE_ALTIVEC # include "ppc/pixel.h" #endif #if HAVE_ARMV6 # include "arm/pixel.h" # include "arm/predict.h" #endif #if HAVE_AARCH64 # include "aarch64/pixel.h" # include "aarch64/predict.h" #endif #if HAVE_MSA # include "mips/pixel.h" #endif #if HAVE_LSX # include "loongarch/pixel.h" #endif /**************************************************************************** * pixel_sad_WxH ****************************************************************************/ #define PIXEL_SAD_C( name, lx, ly ) \ static int name( pixel *pix1, intptr_t i_stride_pix1, \ pixel *pix2, intptr_t i_stride_pix2 ) \ { \ int i_sum = 0; \ for( int y = 0; y < ly; y++ ) \ { \ for( int x = 0; x < lx; x++ ) \ { \ i_sum += abs( pix1[x] - pix2[x] ); \ } \ pix1 += i_stride_pix1; \ pix2 += i_stride_pix2; \ } \ return i_sum; \ } PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 ) PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 ) PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 ) PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 ) PIXEL_SAD_C( x264_pixel_sad_4x16, 4, 16 ) PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 ) PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 ) /**************************************************************************** * pixel_ssd_WxH ****************************************************************************/ #define PIXEL_SSD_C( name, lx, ly ) \ static int name( pixel *pix1, intptr_t i_stride_pix1, \ pixel *pix2, intptr_t i_stride_pix2 ) \ { \ int i_sum = 0; \ for( int y = 0; y < ly; y++ ) \ { \ for( int x = 0; x < lx; x++ ) \ { \ int d = pix1[x] - pix2[x]; \ i_sum += d*d; \ } \ pix1 += i_stride_pix1; \ pix2 += i_stride_pix2; \ } \ return i_sum; \ } PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 ) 
PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 ) PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) PIXEL_SSD_C( x264_pixel_ssd_4x16, 4, 16 ) PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, int i_width, int i_height ) { uint64_t i_ssd = 0; int y; int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15); #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \ pix2 + y*i_pix2 + x, i_pix2 ); for( y = 0; y < i_height-15; y += 16 ) { int x = 0; if( align ) for( ; x < i_width-15; x += 16 ) SSD(PIXEL_16x16); for( ; x < i_width-7; x += 8 ) SSD(PIXEL_8x16); } if( y < i_height-7 ) for( int x = 0; x < i_width-7; x += 8 ) SSD(PIXEL_8x8); #undef SSD #define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; } if( i_width & 7 ) { for( y = 0; y < (i_height & ~7); y++ ) for( int x = i_width & ~7; x < i_width; x++ ) SSD1; } if( i_height & 7 ) { for( y = i_height & ~7; y < i_height; y++ ) for( int x = 0; x < i_width; x++ ) SSD1; } #undef SSD1 return i_ssd; } static void pixel_ssd_nv12_core( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) { *ssd_u = 0, *ssd_v = 0; for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 ) for( int x = 0; x < width; x++ ) { int du = pixuv1[2*x] - pixuv2[2*x]; int dv = pixuv1[2*x+1] - pixuv2[2*x+1]; *ssd_u += du*du; *ssd_v += dv*dv; } } void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ) { pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v ); if( i_width&7 ) { uint64_t tmp[2]; pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height, &tmp[0], &tmp[1] ); *ssd_u += tmp[0]; *ssd_v += tmp[1]; } } /**************************************************************************** * pixel_var_wxh ****************************************************************************/ #define PIXEL_VAR_C( name, w, h ) \ static uint64_t name( pixel *pix, intptr_t i_stride ) \ { \ uint32_t sum = 0, sqr = 0; \ for( int y = 0; y < h; y++ ) \ { \ for( int x = 0; x < w; x++ ) \ { \ sum += pix[x]; \ sqr += pix[x] * pix[x]; \ } \ pix += i_stride; \ } \ return sum + ((uint64_t)sqr << 32); \ } PIXEL_VAR_C( pixel_var_16x16, 16, 16 ) PIXEL_VAR_C( pixel_var_8x16, 8, 16 ) PIXEL_VAR_C( pixel_var_8x8, 8, 8 ) /**************************************************************************** * pixel_var2_wxh ****************************************************************************/ #define PIXEL_VAR2_C( name, h, shift ) \ static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \ { \ int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \ for( int y = 0; y < h; y++ ) \ { \ for( int x = 0; x < 8; x++ ) \ { \ int diff_u = fenc[x] - fdec[x]; \ int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \ sum_u += diff_u; \ sum_v += diff_v; \ sqr_u += diff_u * diff_u; \ sqr_v += diff_v * diff_v; \ } \ fenc += FENC_STRIDE; \ fdec += FDEC_STRIDE; \ } \ ssd[0] = sqr_u; \ ssd[1] = sqr_v; \ return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \ sqr_v - ((int64_t)sum_v * sum_v >> shift); \ } PIXEL_VAR2_C( pixel_var2_8x16, 16, 7 ) PIXEL_VAR2_C( pixel_var2_8x8, 8, 6 ) #if BIT_DEPTH > 8 typedef uint32_t sum_t; 
typedef uint64_t sum2_t; #else typedef uint16_t sum_t; typedef uint32_t sum2_t; #endif #define BITS_PER_SUM (8 * sizeof(sum_t)) #define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\ sum2_t t0 = s0 + s1;\ sum2_t t1 = s0 - s1;\ sum2_t t2 = s2 + s3;\ sum2_t t3 = s2 - s3;\ d0 = t0 + t2;\ d2 = t0 - t2;\ d1 = t1 + t3;\ d3 = t1 - t3;\ } // in: a pseudo-simd number of the form x+(y<<16) // return: abs(x)+(abs(y)<<16) static ALWAYS_INLINE sum2_t abs2( sum2_t a ) { sum2_t s = ((a>>(BITS_PER_SUM-1))&(((sum2_t)1<>BITS_PER_SUM); } return sum >> 1; } static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[4][4]; sum2_t a0, a1, a2, a3; sum2_t sum = 0; for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 ) { a0 = (sum2_t)(pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM); a1 = (sum2_t)(pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM); a2 = (sum2_t)(pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM); a3 = (sum2_t)(pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM); HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 ); } for( int i = 0; i < 4; i++ ) { HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] ); sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); } return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1; } #define PIXEL_SATD_C( w, h, sub )\ static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\ {\ int sum = sub( pix1, i_pix1, pix2, i_pix2 )\ + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\ if( w==16 )\ sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\ + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\ if( h==16 )\ sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\ + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\ if( w==16 && h==16 )\ sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\ + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\ return sum;\ } PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 16, 8, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 8, 16, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 ) PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 ) static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; sum2_t sum = 0; for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 ) { a0 = (sum2_t)(pix1[0] - pix2[0]); a1 = (sum2_t)(pix1[1] - pix2[1]); b0 = (a0+a1) + ((a0-a1)<>BITS_PER_SUM); } return sum; } static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ); return (sum+2)>>2; } static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ) + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 ) + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 ) + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 ); return (sum+2)>>2; } static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; sum2_t a0, a1, a2, a3, dc; sum2_t sum4 = 0, sum8 = 0; for( int i = 0; i < 8; i++, pix+=stride ) { sum2_t *t = tmp + (i&3) + (i&4)*4; a0 = (pix[0]+pix[1]) + ((sum2_t)(pix[0]-pix[1])<>BITS_PER_SUM) - dc; sum8 = (sum_t)sum8 + (sum8>>BITS_PER_SUM) - dc; return ((uint64_t)sum8<<32) + sum4; } #define HADAMARD_AC(w,h) 
\ static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\ {\ uint64_t sum = pixel_hadamard_ac( pix, stride );\ if( w==16 )\ sum += pixel_hadamard_ac( pix+8, stride );\ if( h==16 )\ sum += pixel_hadamard_ac( pix+8*stride, stride );\ if( w==16 && h==16 )\ sum += pixel_hadamard_ac( pix+8*stride+8, stride );\ return ((sum>>34)<<32) + ((uint32_t)sum>>1);\ } HADAMARD_AC( 16, 16 ) HADAMARD_AC( 16, 8 ) HADAMARD_AC( 8, 16 ) HADAMARD_AC( 8, 8 ) /**************************************************************************** * pixel_sad_x4 ****************************************************************************/ #define SAD_X( size ) \ static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\ intptr_t i_stride, int scores[3] )\ {\ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ }\ static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3,\ intptr_t i_stride, int scores[4] )\ {\ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\ } SAD_X( 16x16 ) SAD_X( 16x8 ) SAD_X( 8x16 ) SAD_X( 8x8 ) SAD_X( 8x4 ) SAD_X( 4x8 ) SAD_X( 4x4 ) /**************************************************************************** * pixel_satd_x4 * no faster than single satd, but needed for satd to be a drop-in replacement for sad ****************************************************************************/ #define SATD_X( size, cpu ) \ static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\ intptr_t i_stride, int scores[3] )\ {\ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\ }\ static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\ intptr_t i_stride, int scores[4] )\ {\ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\ scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\ } #define SATD_X_DECL6( cpu )\ SATD_X( 16x16, cpu )\ SATD_X( 16x8, cpu )\ SATD_X( 8x16, cpu )\ SATD_X( 8x8, cpu )\ SATD_X( 8x4, cpu )\ SATD_X( 4x8, cpu ) #define SATD_X_DECL7( cpu )\ SATD_X_DECL6( cpu )\ SATD_X( 4x4, cpu ) SATD_X_DECL7() #if HAVE_MMX SATD_X_DECL7( _mmx2 ) #if !HIGH_BIT_DEPTH SATD_X_DECL6( _sse2 ) SATD_X_DECL7( _ssse3 ) SATD_X_DECL6( _ssse3_atom ) SATD_X_DECL7( _sse4 ) SATD_X_DECL7( _avx ) SATD_X_DECL7( _xop ) SATD_X_DECL7( _avx512 ) #endif // !HIGH_BIT_DEPTH #endif #if !HIGH_BIT_DEPTH #if HAVE_ARMV6 || HAVE_AARCH64 SATD_X_DECL7( _neon ) #endif #endif // !HIGH_BIT_DEPTH #define INTRA_MBCMP_8x8( mbcmp, cpu, cpu2 )\ static void intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\ {\ ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\ x264_predict_8x8_v##cpu2( pix, edge );\ res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, 
FENC_STRIDE );\ x264_predict_8x8_h##cpu2( pix, edge );\ res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_8x8_dc##cpu2( pix, edge );\ res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ } INTRA_MBCMP_8x8( sad,, _c ) INTRA_MBCMP_8x8(sa8d,, _c ) #if HIGH_BIT_DEPTH && HAVE_MMX #define x264_predict_8x8_v_sse2 x264_predict_8x8_v_sse INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif #if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || HAVE_AARCH64) INTRA_MBCMP_8x8( sad, _neon, _neon ) INTRA_MBCMP_8x8(sa8d, _neon, _neon ) #endif #define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\ static void intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\ {\ x264_predict_##size##chroma##_##pred1##cpu2( fdec );\ res[0] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_##size##chroma##_##pred2##cpu2( fdec );\ res[1] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_##size##chroma##_##pred3##cpu2( fdec );\ res[2] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ } INTRA_MBCMP( sad, 4x4, v, h, dc, ,, _c ) INTRA_MBCMP(satd, 4x4, v, h, dc, ,, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c,, _c ) INTRA_MBCMP(satd, 8x8, dc, h, v, c,, _c ) INTRA_MBCMP( sad, 8x16, dc, h, v, c,, _c ) INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c ) INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c ) #if HAVE_MMX #if HIGH_BIT_DEPTH #define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx #define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c #define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c #define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse #define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse #define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _mmx2 ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _sse2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _sse2 ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _ssse3, _sse2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _sse2 ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _sse2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _sse2 ) #else #define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 ) #endif #endif #if !HIGH_BIT_DEPTH && HAVE_ARMV6 INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _armv6 ) INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _armv6 ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) INTRA_MBCMP( sad, 16x16, v, h, 
dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif #if !HIGH_BIT_DEPTH && HAVE_AARCH64 INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif // No C implementation of intra_satd_x9. See checkasm for its behavior, // or see mb_analyse_intra for the entirely different algorithm we // use when lacking an asm implementation of it. /**************************************************************************** * structural similarity metric ****************************************************************************/ static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4] ) { for( int z = 0; z < 2; z++ ) { uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0; for( int y = 0; y < 4; y++ ) for( int x = 0; x < 4; x++ ) { int a = pix1[x+y*stride1]; int b = pix2[x+y*stride2]; s1 += a; s2 += b; ss += a*a; ss += b*b; s12 += a*b; } sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12; pix1 += 4; pix2 += 4; } } static float ssim_end1( int s1, int s2, int ss, int s12 ) { /* Maximum value for 10-bit is: ss*64 = (2^10-1)^2*16*4*64 = 4286582784, which will overflow in some cases. * s1*s1, s2*s2, and s1*s2 also obtain this value for edge cases: ((2^10-1)*16*4)^2 = 4286582784. * Maximum value for 9-bit is: ss*64 = (2^9-1)^2*16*4*64 = 1069551616, which will not overflow. */ #if BIT_DEPTH > 9 #define type float static const float ssim_c1 = .01*.01*PIXEL_MAX*PIXEL_MAX*64; static const float ssim_c2 = .03*.03*PIXEL_MAX*PIXEL_MAX*64*63; #else #define type int static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5); static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5); #endif type fs1 = s1; type fs2 = s2; type fss = ss; type fs12 = s12; type vars = fss*64 - fs1*fs1 - fs2*fs2; type covar = fs12*64 - fs1*fs2; return (float)(2*fs1*fs2 + ssim_c1) * (float)(2*covar + ssim_c2) / ((float)(fs1*fs1 + fs2*fs2 + ssim_c1) * (float)(vars + ssim_c2)); #undef type } static float ssim_end4( int sum0[5][4], int sum1[5][4], int width ) { float ssim = 0.0; for( int i = 0; i < width; i++ ) ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0], sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1], sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2], sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] ); return ssim; } float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int width, int height, void *buf, int *cnt ) { int z = 0; float ssim = 0.0; int (*sum0)[4] = buf; int (*sum1)[4] = sum0 + (width >> 2) + 3; width >>= 2; height >>= 2; for( int y = 1; y < height; y++ ) { for( ; z <= y; z++ ) { XCHG( void*, sum0, sum1 ); for( int x = 0; x < width; x+=2 ) pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] ); } for( int x = 0; x < width-1; x += 4 ) ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) ); } *cnt = (height-1) * (width-1); return ssim; } static int pixel_vsad( pixel *src, intptr_t stride, int height ) { int score = 0; for( int i = 1; i < height; i++, src += stride ) for( int j = 0; j < 16; j++ ) score += 
abs(src[j] - src[j+stride]); return score; } int x264_field_vsad( x264_t *h, int mb_x, int mb_y ) { int score_field, score_frame; int stride = h->fenc->i_stride[0]; int mb_stride = h->mb.i_mb_stride; pixel *fenc = h->fenc->plane[0] + 16 * (mb_x + mb_y * stride); int mb_xy = mb_x + mb_y*mb_stride; /* We don't want to analyze pixels outside the frame, as it gives inaccurate results. */ int mbpair_height = X264_MIN( h->param.i_height - mb_y * 16, 32 ); score_frame = h->pixf.vsad( fenc, stride, mbpair_height ); score_field = h->pixf.vsad( fenc, stride*2, mbpair_height >> 1 ); score_field += h->pixf.vsad( fenc+stride, stride*2, mbpair_height >> 1 ); if( mb_x > 0 ) score_field += 512 - h->mb.field[mb_xy -1]*1024; if( mb_y > 0 ) score_field += 512 - h->mb.field[mb_xy-mb_stride]*1024; return (score_field < score_frame); } static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ) { int sum = 0; for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 ) for( int x = 0; x < 8; x++ ) sum += pix1[x] - pix2[x]; return abs( sum ); } /**************************************************************************** * successive elimination ****************************************************************************/ static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta, uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) { int nmv = 0; for( int i = 0; i < width; i++, sums++ ) { int ads = abs( enc_dc[0] - sums[0] ) + abs( enc_dc[1] - sums[8] ) + abs( enc_dc[2] - sums[delta] ) + abs( enc_dc[3] - sums[delta+8] ) + cost_mvx[i]; if( ads < thresh ) mvs[nmv++] = i; } return nmv; } static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta, uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) { int nmv = 0; for( int i = 0; i < width; i++, sums++ ) { int ads = abs( enc_dc[0] - sums[0] ) + abs( enc_dc[1] - sums[delta] ) + cost_mvx[i]; if( ads < thresh ) mvs[nmv++] = i; } return nmv; } static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta, uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) { int nmv = 0; for( int i = 0; iname1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\ pixf->name1[PIXEL_16x8] = x264_pixel_##name2##_16x8##cpu; #define INIT4_NAME( name1, name2, cpu ) \ INIT2_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_8x16] = x264_pixel_##name2##_8x16##cpu;\ pixf->name1[PIXEL_8x8] = x264_pixel_##name2##_8x8##cpu; #define INIT5_NAME( name1, name2, cpu ) \ INIT4_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu; #define INIT6_NAME( name1, name2, cpu ) \ INIT5_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu; #define INIT7_NAME( name1, name2, cpu ) \ INIT6_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu; #define INIT8_NAME( name1, name2, cpu ) \ INIT7_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu; #if HAVE_SVE #define INIT7_NAME_SVE_SSD_10BIT( ) \ pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \ pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; #endif #if HAVE_SVE #define INIT8_NAME_SVE_SSD( ) \ pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_sve; \ pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_sve; \ pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; \ pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \ pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve; #define INIT8_NAME_SVE_SSD_10BIT() \ INIT7_NAME_SVE_SSD_10BIT() \ pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve; #endif #define INIT2( name, 
cpu ) INIT2_NAME( name, name, cpu ) #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu ) #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu ) #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu ) #define INIT7( name, cpu ) INIT7_NAME( name, name, cpu ) #define INIT8( name, cpu ) INIT8_NAME( name, name, cpu ) #if HAVE_SVE #define INIT8_SVE_SSD( ) INIT8_NAME_SVE_SSD( ) #define INIT8_SVE_SSD_10BIT( ) INIT8_NAME_SVE_SSD_10BIT( ) #endif #define INIT_ADS( cpu ) \ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\ pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\ pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu; INIT8( sad, ); INIT8_NAME( sad_aligned, sad, ); INIT7( sad_x3, ); INIT7( sad_x4, ); INIT8( ssd, ); INIT8( satd, ); INIT7( satd_x3, ); INIT7( satd_x4, ); INIT4( hadamard_ac, ); INIT_ADS( ); pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; pixf->var[PIXEL_16x16] = pixel_var_16x16; pixf->var[PIXEL_8x16] = pixel_var_8x16; pixf->var[PIXEL_8x8] = pixel_var_8x8; pixf->var2[PIXEL_8x16] = pixel_var2_8x16; pixf->var2[PIXEL_8x8] = pixel_var2_8x8; pixf->ssd_nv12_core = pixel_ssd_nv12_core; pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; pixf->vsad = pixel_vsad; pixf->asd8 = pixel_asd8; pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4; pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4; pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8; pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c; pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16; pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16; #if HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX2 ) { INIT7( sad, _mmx2 ); INIT7_NAME( sad_aligned, sad, _mmx2 ); INIT7( sad_x3, _mmx2 ); INIT7( sad_x4, _mmx2 ); INIT8( satd, _mmx2 ); INIT7( satd_x3, _mmx2 ); INIT7( satd_x4, _mmx2 ); INIT4( hadamard_ac, _mmx2 ); INIT8( ssd, _mmx2 ); pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_mmx2; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_mmx2; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_mmx2; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_mmx2; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_mmx2; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_mmx2; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2; } if( cpu&X264_CPU_SSE2 ) { INIT4_NAME( sad_aligned, sad, _sse2_aligned ); INIT5( ssd, _sse2 ); INIT6( satd, _sse2 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { INIT5( sad, _sse2 ); INIT2( sad_x3, _sse2 ); INIT2( sad_x4, _sse2 ); INIT_ADS( _sse2 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( 
hadamard_ac, _sse2 ); } pixf->vsad = x264_pixel_vsad_sse2; pixf->asd8 = x264_pixel_asd8_sse2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_sse2; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_sse2; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse2; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_sse2; } if( cpu&X264_CPU_SSE2_IS_FAST ) { pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2; pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2; pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2; pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2; pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2; pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2; } if( cpu&X264_CPU_SSSE3 ) { INIT4_NAME( sad_aligned, sad, _ssse3_aligned ); pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_ssse3; pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_ssse3; INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); INIT_ADS( _ssse3 ); INIT6( satd, _ssse3 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _ssse3 ); } pixf->vsad = x264_pixel_vsad_ssse3; pixf->asd8 = x264_pixel_asd8_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; #endif pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_ssse3; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_ssse3; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_ssse3; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_ssse3; } if( cpu&X264_CPU_SSE4 ) { INIT6( satd, _sse4 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _sse4 ); } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; #endif pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse4; } if( cpu&X264_CPU_AVX ) { INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT_ADS( _avx ); INIT6( satd, _avx ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) { INIT4( hadamard_ac, _avx ); } pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; #endif pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_avx; } if( cpu&X264_CPU_XOP ) { INIT5( sad_x3, _xop ); INIT5( sad_x4, _xop ); pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; #endif } if( cpu&X264_CPU_AVX2 ) { INIT2( ssd, _avx2 ); INIT2( sad, _avx2 ); INIT2_NAME( sad_aligned, sad, _avx2 ); INIT2( sad_x3, _avx2 ); INIT2( sad_x4, _avx2 ); INIT_ADS( _avx2 ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; pixf->var2[PIXEL_8x8] = 
x264_pixel_var2_8x8_avx2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; pixf->vsad = x264_pixel_vsad_avx2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; } if( cpu&X264_CPU_AVX512 ) { pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; } #endif // HAVE_MMX #if HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { INIT8( sad, _neon ); INIT7( sad_x3, _neon); pixf->vsad = x264_pixel_vsad_neon; pixf->asd8 = x264_pixel_asd8_neon; INIT8(ssd, _neon); pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_neon; pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_neon; pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_neon; pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_neon; pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_neon; pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_neon; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_neon; pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_neon; INIT7(sad_x4, _neon); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; INIT4(hadamard_ac, _neon); pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; } #if HAVE_SVE if( cpu&X264_CPU_SVE ) { INIT8_SVE_SSD_10BIT(); } #endif #endif // HAVE_AARCH64 #else // !HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { INIT8( ssd, _mmx ); } if( cpu&X264_CPU_MMX2 ) { INIT8( sad, _mmx2 ); INIT8_NAME( sad_aligned, sad, _mmx2 ); INIT7( sad_x3, _mmx2 ); INIT7( sad_x4, _mmx2 ); INIT8( satd, _mmx2 ); INIT7( satd_x3, _mmx2 ); INIT7( satd_x4, _mmx2 ); INIT4( hadamard_ac, _mmx2 ); INIT_ADS( _mmx2 ); #if ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2; pixf->vsad = x264_pixel_vsad_mmx2; if( cpu&X264_CPU_CACHELINE_32 ) { INIT5( sad, _cache32_mmx2 ); INIT4( sad_x3, _cache32_mmx2 ); INIT4( sad_x4, _cache32_mmx2 ); } else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { INIT5( sad, _cache64_mmx2 ); INIT4( sad_x3, _cache64_mmx2 ); INIT4( sad_x4, _cache64_mmx2 ); } #else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) ) { pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2; pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2; pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmx2; pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2; pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmx2; pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2; pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmx2; } #endif pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_mmx2; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_mmx2; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2; pixf->intra_sad_x3_8x8 = 
x264_intra_sad_x3_8x8_mmx2; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2; } if( cpu&X264_CPU_SSE2 ) { INIT5( ssd, _sse2slow ); INIT2_NAME( sad_aligned, sad, _sse2_aligned ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2; #endif pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; pixf->vsad = x264_pixel_vsad_sse2; pixf->asd8 = x264_pixel_asd8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { INIT2( sad, _sse2 ); INIT2( sad_x3, _sse2 ); INIT2( sad_x4, _sse2 ); INIT6( satd, _sse2 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2; INIT6( satd_x3, _sse2 ); INIT6( satd_x4, _sse2 ); INIT4( hadamard_ac, _sse2 ); INIT_ADS( _sse2 ); pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse2; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_sse2; if( cpu&X264_CPU_CACHELINE_64 ) { INIT2( ssd, _sse2); /* faster for width 16 on p4 */ #if ARCH_X86 INIT2( sad, _cache64_sse2 ); INIT2( sad_x3, _cache64_sse2 ); INIT2( sad_x4, _cache64_sse2 ); #endif if( cpu&X264_CPU_SSE2_IS_FAST ) { pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2; pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2; } } } if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) ) { pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2; pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2; pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2; pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2; pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2; pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2; pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2; } if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) ) { INIT2( sad, _sse3 ); INIT2( sad_x3, _sse3 ); INIT2( sad_x4, _sse3 ); } if( cpu&X264_CPU_SSSE3 ) { INIT4( hadamard_ac, _ssse3 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3; #if ARCH_X86_64 pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3; #endif } INIT_ADS( _ssse3 ); if( cpu&X264_CPU_SLOW_ATOM ) { pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom; INIT6( satd, _ssse3_atom ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom; INIT6( satd_x3, _ssse3_atom ); INIT6( satd_x4, _ssse3_atom ); INIT4( hadamard_ac, _ssse3_atom ); #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom; #endif } else { INIT8( ssd, _ssse3 ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; INIT8( satd, _ssse3 ); INIT7( satd_x3, _ssse3 ); INIT7( satd_x4, _ssse3 ); #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3; #endif } pixf->intra_satd_x3_16x16 = 
x264_intra_satd_x3_16x16_ssse3; if( !(cpu&X264_CPU_SLOW_PSHUFB) ) pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_ssse3; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3; pixf->asd8 = x264_pixel_asd8_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { INIT2( sad, _cache64_ssse3 ); INIT2( sad_x3, _cache64_ssse3 ); INIT2( sad_x4, _cache64_ssse3 ); } else { INIT2( sad_x3, _ssse3 ); INIT5( sad_x4, _ssse3 ); } if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) ) { INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */ } } if( cpu&X264_CPU_SSE4 ) { INIT8( satd, _sse4 ); INIT7( satd_x3, _sse4 ); INIT7( satd_x4, _sse4 ); INIT4( hadamard_ac, _sse4 ); if( !(cpu&X264_CPU_STACK_MOD4) ) { pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4; #if ARCH_X86_64 pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_sse4; #endif } pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_sse4; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4; #endif } if( cpu&X264_CPU_AVX ) { INIT2_NAME( sad_aligned, sad, _sse2 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ INIT2( sad_x3, _avx ); INIT2( sad_x4, _avx ); INIT8( satd, _avx ); INIT7( satd_x3, _avx ); INIT7( satd_x4, _avx ); INIT_ADS( _avx ); INIT4( hadamard_ac, _avx ); if( !(cpu&X264_CPU_STACK_MOD4) ) { pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx; pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx; #if ARCH_X86_64 pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_avx; #endif } INIT5( ssd, _avx ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_avx; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx; pixf->ssim_end4 = x264_pixel_ssim_end4_avx; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx; #endif } if( cpu&X264_CPU_XOP ) { INIT7( satd, _xop ); INIT7( satd_x3, _xop ); INIT7( satd_x4, _xop ); INIT4( hadamard_ac, _xop ); if( !(cpu&X264_CPU_STACK_MOD4) ) { pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop; } INIT5( ssd, _xop ); pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_xop; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; #endif } if( cpu&X264_CPU_AVX2 ) { INIT2( ssd, _avx2 ); INIT2( sad_x3, _avx2 ); INIT2( sad_x4, _avx2 ); INIT4( satd, _avx2 ); INIT2( hadamard_ac, _avx2 ); INIT_ADS( _avx2 ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2; pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2; 
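/* Note on the dispatch pattern used throughout this initializer: the CPU feature
 * checks are applied in increasing order of capability (MMX2 -> SSE2 -> SSSE3 ->
 * SSE4 -> AVX -> XOP -> AVX2 -> AVX512), and each block simply overwrites the
 * function pointers installed by the less capable blocks before it, so the table
 * ends up holding the fastest implementation the running CPU supports.
 * As a rough illustration (following the INIT*_NAME helper macros defined earlier
 * in this file), INIT2( sad_x3, _avx2 ) in this block expands to bulk assignments
 * of the two largest partition sizes, approximately:
 *   pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_avx2;
 *   pixf->sad_x3[PIXEL_16x8]  = x264_pixel_sad_x3_16x8_avx2;
 * while INIT7/INIT8 extend the same pattern down to the 4x4 partition. */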
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_avx2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2; #endif } if( cpu&X264_CPU_AVX512 ) { INIT8( sad, _avx512 ); INIT8_NAME( sad_aligned, sad, _avx512 ); INIT7( sad_x3, _avx512 ); INIT7( sad_x4, _avx512 ); INIT8( satd, _avx512 ); INIT7( satd_x3, _avx512 ); INIT7( satd_x4, _avx512 ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; } #endif //HAVE_MMX #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) { pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6; pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6; pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6; pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6; } if( cpu&X264_CPU_NEON ) { INIT5( sad, _neon ); INIT5( sad_aligned, _neon ); INIT7( sad_x3, _neon ); INIT7( sad_x4, _neon ); INIT7( ssd, _neon ); INIT7( satd, _neon ); INIT7( satd_x3, _neon ); INIT7( satd_x4, _neon ); INIT4( hadamard_ac, _neon ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; pixf->vsad = x264_pixel_vsad_neon; pixf->asd8 = x264_pixel_asd8_neon; pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_neon; pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_neon; pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_neon; pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_neon; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_neon; pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_neon; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_neon; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_neon; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_neon; pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_neon; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; if( cpu&X264_CPU_FAST_NEON_MRC ) { pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon; pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon; pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon; pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon; } else // really just scheduled for dual issue / A8 { INIT5( sad_aligned, _neon_dual ); } } #endif #if HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { INIT8( sad, _neon ); // AArch64 has no distinct instructions for aligned load/store INIT8_NAME( sad_aligned, sad, _neon ); INIT7( sad_x3, _neon ); INIT7( sad_x4, _neon ); INIT8( ssd, _neon ); INIT8( satd, _neon ); INIT7( satd_x3, _neon ); INIT7( satd_x4, _neon ); INIT4( hadamard_ac, _neon ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; pixf->vsad = 
x264_pixel_vsad_neon; pixf->asd8 = x264_pixel_asd8_neon; pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_neon; pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_neon; pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_neon; pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_neon; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_neon; pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_neon; pixf->intra_sad_x3_8x16c = intra_sad_x3_8x16c_neon; pixf->intra_satd_x3_8x16c = intra_satd_x3_8x16c_neon; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_neon; pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_neon; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; } #if HAVE_DOTPROD if( cpu&X264_CPU_DOTPROD ) { pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_neon_dotprod; pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_neon_dotprod; pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_neon_dotprod; pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_neon_dotprod; pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_neon_dotprod; pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_neon_dotprod; pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_neon_dotprod; pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_neon_dotprod; pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_neon_dotprod; pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_neon_dotprod; pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_neon_dotprod; pixf->vsad = x264_pixel_vsad_neon_dotprod; } #endif // HAVE_DOTPROD #if HAVE_SVE if( cpu&X264_CPU_SVE ) { INIT8_SVE_SSD( ); INIT4( hadamard_ac, _sve ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sve; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sve; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sve; } #endif #endif // HAVE_AARCH64 #if HAVE_MSA if( cpu&X264_CPU_MSA ) { INIT8( sad, _msa ); INIT8_NAME( sad_aligned, sad, _msa ); INIT8( ssd, _msa ); INIT7( sad_x3, _msa ); INIT7( sad_x4, _msa ); INIT8( satd, _msa ); INIT4( hadamard_ac, _msa ); pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_msa; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_msa; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_msa; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_msa; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_msa; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_msa; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_msa; pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_msa; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_msa; } #endif // HAVE_MSA #if HAVE_LSX if( cpu&X264_CPU_LSX ) { INIT8( sad, _lsx ); INIT8_NAME( sad_aligned, sad, _lsx ); INIT8( ssd, _lsx ); INIT7( sad_x3, _lsx ); INIT7( sad_x4, _lsx ); INIT8( satd, _lsx ); INIT4( hadamard_ac, _lsx ); pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_lsx; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_lsx; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_lsx; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_lsx; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_lsx; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_lsx; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_lsx; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_lsx; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_lsx; 
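/* LoongArch dispatch mirrors the x86 pattern above: this 128-bit LSX block
 * installs a full set of pointers first, and the 256-bit LASX block that follows
 * overrides only the entries where the wider vectors are a win
 * (satd, sad_x3/sad_x4, hadamard_ac, sa8d, var2, ...). */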
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_lsx; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_lsx; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_lsx; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_lsx; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_lsx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_lsx; } if( cpu&X264_CPU_LASX ) { INIT4( ssd, _lasx ); INIT4( hadamard_ac, _lasx ); pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_lasx; pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_lasx; pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_lasx; pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_lasx; pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_lasx; pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_lasx; pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_lasx; pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_lasx; pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_lasx; pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_lasx; pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_lasx; pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_lasx; pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_lasx; pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_lasx; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_lasx; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_lasx; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_lasx; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_lasx; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_lasx; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_lasx; } #endif /* HAVE_LSX */ #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { x264_pixel_init_altivec( pixf ); } #endif pixf->ads[PIXEL_8x16] = pixf->ads[PIXEL_8x4] = pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8]; pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8]; } x264-master/common/pixel.h000066400000000000000000000170631502133446700156500ustar00rootroot00000000000000/***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** * Copyright (C) 2004-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_PIXEL_H #define X264_PIXEL_H // SSD assumes all args aligned // other cmp functions assume first arg aligned typedef int (*x264_pixel_cmp_t)( pixel *, intptr_t, pixel *, intptr_t ); typedef void (*x264_pixel_cmp_x3_t)( pixel *, pixel *, pixel *, pixel *, intptr_t, int[3] ); typedef void (*x264_pixel_cmp_x4_t)( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int[4] ); enum { PIXEL_16x16 = 0, PIXEL_16x8 = 1, PIXEL_8x16 = 2, PIXEL_8x8 = 3, PIXEL_8x4 = 4, PIXEL_4x8 = 5, PIXEL_4x4 = 6, /* Subsampled chroma only */ PIXEL_4x16 = 7, /* 4:2:2 */ PIXEL_4x2 = 8, PIXEL_2x8 = 9, /* 4:2:2 */ PIXEL_2x4 = 10, PIXEL_2x2 = 11, }; static const struct { uint8_t w, h; } x264_pixel_size[12] = { { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 16 }, { 4, 2 }, { 2, 8 }, { 2, 4 }, { 2, 2 }, }; static const uint8_t x264_size2pixel[5][5] = { { 0, }, { 0, PIXEL_4x4, PIXEL_8x4, 0, 0 }, { 0, PIXEL_4x8, PIXEL_8x8, 0, PIXEL_16x8 }, { 0, }, { 0, 0, PIXEL_8x16, 0, PIXEL_16x16 } }; static const uint8_t x264_luma2chroma_pixel[4][7] = { { 0 }, { PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */ { PIXEL_8x16, PIXEL_8x8, PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */ { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */ }; typedef struct { x264_pixel_cmp_t sad[8]; x264_pixel_cmp_t ssd[8]; x264_pixel_cmp_t satd[8]; x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */ x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */ x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */ x264_pixel_cmp_x3_t fpelcmp_x3[7]; x264_pixel_cmp_x4_t fpelcmp_x4[7]; x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, intptr_t, int ); int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] ); uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride ); void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); void (*ssim_4x4x2_core)( const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); /* multiple parallel calls to cmp. */ x264_pixel_cmp_x3_t sad_x3[7]; x264_pixel_cmp_x4_t sad_x4[7]; x264_pixel_cmp_x3_t satd_x3[7]; x264_pixel_cmp_x4_t satd_x4[7]; /* abs-diff-sum for successive elimination. * may round width up to a multiple of 16. */ int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta, uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ); /* calculate satd or sad of V, H, and DC modes. 
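 * The intra_mbcmp_x3_* members below are aliases: at encoder init they are pointed
 * at either the corresponding intra_satd_x3_* or intra_sad_x3_* entry, following
 * the same satd-or-sad mbcmp selection used for mode decision.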
*/ void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); /* find minimum satd or sad of all modes, and set fdec. * may be NULL, in which case just use pred+satd instead. */ int (*intra_mbcmp_x9_4x4)( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); int (*intra_satd_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); int (*intra_sad_x9_4x4) ( pixel *fenc, pixel *fdec, uint16_t *bitcosts ); int (*intra_mbcmp_x9_8x8)( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds ); int (*intra_sa8d_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds ); int (*intra_sad_x9_8x8) ( pixel *fenc, pixel *fdec, pixel edge[36], uint16_t *bitcosts, uint16_t *satds ); } x264_pixel_function_t; #define x264_pixel_init x264_template(pixel_init) void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ); #define x264_pixel_ssd_nv12 x264_template(pixel_ssd_nv12) void x264_pixel_ssd_nv12 ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ); #define x264_pixel_ssd_wxh x264_template(pixel_ssd_wxh) uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, int i_width, int i_height ); #define x264_pixel_ssim_wxh x264_template(pixel_ssim_wxh) float x264_pixel_ssim_wxh ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, int i_width, int i_height, void *buf, int *cnt ); #define x264_field_vsad x264_template(field_vsad) int x264_field_vsad( x264_t *h, int mb_x, int mb_y ); #endif x264-master/common/ppc/000077500000000000000000000000001502133446700151315ustar00rootroot00000000000000x264-master/common/ppc/dct.c000066400000000000000000001044771502133446700160640ustar00rootroot00000000000000/***************************************************************************** * dct.c: ppc transform and zigzag ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Guillaume Poirier * Eric Petit * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any 
later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "ppccommon.h" #include "dct.h" #if !HIGH_BIT_DEPTH #define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \ b1 = vec_add( a0, a3 ); \ b3 = vec_add( a1, a2 ); \ b0 = vec_add( b1, b3 ); \ b2 = vec_sub( b1, b3 ); \ a0 = vec_sub( a0, a3 ); \ a1 = vec_sub( a1, a2 ); \ b1 = vec_add( a0, a0 ); \ b1 = vec_add( b1, a1 ); \ b3 = vec_sub( a0, a1 ); \ b3 = vec_sub( b3, a1 ) void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ) { PREP_DIFF_8BYTEALIGNED; vec_s16_t dct0v, dct1v, dct2v, dct3v; vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v; vec_u8_t permHighv; VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v ); VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v ); VEC_TRANSPOSE_4( tmp0v, tmp1v, tmp2v, tmp3v, dct0v, dct1v, dct2v, dct3v ); permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17); VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v ); vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct); vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct); } void x264_sub8x8_dct_altivec( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 ) { PREP_DIFF_8BYTEALIGNED; vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v; vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v; vec_u8_t permHighv, permLowv; VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v ); VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v ); VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v ); VEC_TRANSPOSE_8( tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v, dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ); permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17); permLowv = (vec_u8_t) CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v ); VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v ); vec_st(vec_perm(tmp0v, 
tmp1v, permHighv), 0, *dct); vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, *dct); vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32, *dct); vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48, *dct); vec_st(vec_perm(tmp0v, tmp1v, permLowv), 64, *dct); vec_st(vec_perm(tmp2v, tmp3v, permLowv), 80, *dct); vec_st(vec_perm(tmp4v, tmp5v, permLowv), 96, *dct); vec_st(vec_perm(tmp6v, tmp7v, permLowv), 112, *dct); } void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ) { x264_sub8x8_dct_altivec( &dct[ 0], &pix1[0], &pix2[0] ); x264_sub8x8_dct_altivec( &dct[ 4], &pix1[8], &pix2[8] ); x264_sub8x8_dct_altivec( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); x264_sub8x8_dct_altivec( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); } /*************************************************************************** * 8x8 transform: ***************************************************************************/ static void pix_diff( uint8_t *p1, uint8_t *p2, vec_s16_t *diff, int i ) { vec_s16_t pix1v, pix2v, tmp[4]; vec_u8_t pix1v8, pix2v8; LOAD_ZERO; for( int j = 0; j < 4; j++ ) { pix1v8 = vec_vsx_ld( 0, p1 ); pix2v8 = vec_vsx_ld( 0, p2 ); pix1v = vec_u8_to_s16_h( pix1v8 ); pix2v = vec_u8_to_s16_h( pix2v8 ); tmp[j] = vec_sub( pix1v, pix2v ); p1 += FENC_STRIDE; p2 += FDEC_STRIDE; } diff[i] = vec_add( tmp[0], tmp[1] ); diff[i] = vec_add( diff[i], tmp[2] ); diff[i] = vec_add( diff[i], tmp[3] ); } void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ) { vec_s16_t diff[2], tmp; vec_s32_t sum[2]; vec_s32_t zero32 = vec_splat_s32(0); vec_u8_t mask = { 0x00, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05, 0x02, 0x03, 0x02, 0x03, 0x06, 0x07, 0x06, 0x07 }; pix_diff( &pix1[0], &pix2[0], diff, 0 ); pix_diff( &pix1[4*FENC_STRIDE], &pix2[4*FDEC_STRIDE], diff, 1 ); sum[0] = vec_sum4s( diff[0], zero32 ); sum[1] = vec_sum4s( diff[1], zero32 ); diff[0] = vec_packs( sum[0], sum[1] ); sum[0] = vec_sum4s( diff[0], zero32 ); diff[0] = vec_packs( sum[0], zero32 ); diff[0] = vec_perm( diff[0], diff[0], mask ); // 0 0 2 2 1 1 3 3 tmp = xxpermdi( diff[0], diff[0], 2 ); // 1 1 3 3 0 0 2 2 diff[1] = vec_add( diff[0], tmp ); // 0+1 0+1 2+3 2+3 diff[0] = vec_sub( diff[0], tmp ); // 0-1 0-1 2-3 2-3 tmp = vec_mergeh( diff[1], diff[0] ); // 0+1 0-1 0+1 0-1 2+3 2-3 2+3 2-3 diff[0] = xxpermdi( tmp, tmp, 2 ); // 2+3 2-3 2+3 2-3 diff[1] = vec_add( tmp, diff[0] ); // 0+1+2+3 0-1+2+3 diff[0] = vec_sub( tmp, diff[0] ); // 0+1-2-3 0-1-2+3 diff[0] = vec_mergeh( diff[1], diff[0] ); diff[1] = vec_ld( 0, dct ); diff[0] = xxpermdi( diff[0], diff[1], 0 ); vec_st( diff[0], 0, dct ); } /* DCT8_1D unrolled by 8 in Altivec */ #define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \ { \ /* int s07 = SRC(0) + SRC(7); */ \ vec_s16_t s07v = vec_add( dct0v, dct7v); \ /* int s16 = SRC(1) + SRC(6); */ \ vec_s16_t s16v = vec_add( dct1v, dct6v); \ /* int s25 = SRC(2) + SRC(5); */ \ vec_s16_t s25v = vec_add( dct2v, dct5v); \ /* int s34 = SRC(3) + SRC(4); */ \ vec_s16_t s34v = vec_add( dct3v, dct4v); \ \ /* int a0 = s07 + s34; */ \ vec_s16_t a0v = vec_add(s07v, s34v); \ /* int a1 = s16 + s25; */ \ vec_s16_t a1v = vec_add(s16v, s25v); \ /* int a2 = s07 - s34; */ \ vec_s16_t a2v = vec_sub(s07v, s34v); \ /* int a3 = s16 - s25; */ \ vec_s16_t a3v = vec_sub(s16v, s25v); \ \ /* int d07 = SRC(0) - SRC(7); */ \ vec_s16_t d07v = vec_sub( dct0v, dct7v); \ /* int d16 = SRC(1) - SRC(6); */ \ vec_s16_t d16v = vec_sub( dct1v, dct6v); \ /* int d25 = SRC(2) - SRC(5); */ \ vec_s16_t d25v = vec_sub( 
dct2v, dct5v); \ /* int d34 = SRC(3) - SRC(4); */ \ vec_s16_t d34v = vec_sub( dct3v, dct4v); \ \ /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \ vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\ /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \ vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\ /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \ vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\ /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \ vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\ \ /* DST(0) = a0 + a1; */ \ dct0v = vec_add( a0v, a1v ); \ /* DST(1) = a4 + (a7>>2); */ \ dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \ /* DST(2) = a2 + (a3>>1); */ \ dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \ /* DST(3) = a5 + (a6>>2); */ \ dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \ /* DST(4) = a0 - a1; */ \ dct4v = vec_sub( a0v, a1v ); \ /* DST(5) = a6 - (a5>>2); */ \ dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \ /* DST(6) = (a2>>1) - a3 ; */ \ dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \ /* DST(7) = (a4>>2) - a7 ; */ \ dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \ } void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ) { vec_u16_t onev = vec_splat_u16(1); vec_u16_t twov = vec_add( onev, onev ); PREP_DIFF_8BYTEALIGNED; vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v; VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v ); VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v ); DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ); vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v, dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v; VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v, dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v, dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v ); DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v, dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v ); vec_st( dct_tr0v, 0, dct ); vec_st( dct_tr1v, 16, dct ); vec_st( dct_tr2v, 32, dct ); vec_st( dct_tr3v, 48, dct ); vec_st( dct_tr4v, 64, dct ); vec_st( dct_tr5v, 80, dct ); vec_st( dct_tr6v, 96, dct ); vec_st( dct_tr7v, 112, dct ); } void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ) { x264_sub8x8_dct8_altivec( dct[0], &pix1[0], &pix2[0] ); x264_sub8x8_dct8_altivec( dct[1], &pix1[8], &pix2[8] ); x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] ); x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] ); } /**************************************************************************** * IDCT transform: ****************************************************************************/ #define ALTIVEC_STORE8_DC_SUM_CLIP(dest, dcv) \ { \ /* unaligned load */ \ vec_u8_t dstv = vec_vsx_ld( 0, dest ); \ vec_s16_t dcvsum = vec_adds( dcv, vec_u8_to_s16_h( dstv ) ); \ vec_u8_t dcvsum8 = vec_packsu( dcvsum, vec_u8_to_s16_l( dstv ) ); \ /* unaligned store */ \ vec_vsx_st( dcvsum8, 0, dest 
); \ } void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] ) { vec_s16_t dcv0, dcv1; vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) ); vec_u16_t v6 = vec_splat_u16( 6 ); vec_s16_t dctv = vec_ld( 0, dct ); vec_u8_t dstv0, dstv1, dstv2, dstv3, dstv4, dstv5, dstv6, dstv7; vec_s16_t dcvsum0, dcvsum1, dcvsum2, dcvsum3, dcvsum4, dcvsum5, dcvsum6, dcvsum7; vec_u8_t dcvsum8_0, dcvsum8_1, dcvsum8_2, dcvsum8_3, dcvsum8_4, dcvsum8_5, dcvsum8_6, dcvsum8_7; LOAD_ZERO; dctv = vec_sra( vec_add( dctv, v32 ), v6 ); dcv1 = (vec_s16_t)vec_mergeh( dctv, dctv ); dcv0 = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv1, (vec_s32_t)dcv1 ); dcv1 = (vec_s16_t)vec_mergel( (vec_s32_t)dcv1, (vec_s32_t)dcv1 ); dstv0 = vec_vsx_ld( 0, p_dst ); dstv4 = vec_vsx_ld( 0, p_dst + 4*FDEC_STRIDE ); dstv1 = vec_vsx_ld( 0, p_dst + 1*FDEC_STRIDE ); dstv5 = vec_vsx_ld( 0, p_dst + 4*FDEC_STRIDE + 1*FDEC_STRIDE ); dstv2 = vec_vsx_ld( 0, p_dst + 2*FDEC_STRIDE); dstv6 = vec_vsx_ld( 0, p_dst + 4*FDEC_STRIDE + 2*FDEC_STRIDE ); dstv3 = vec_vsx_ld( 0, p_dst + 3*FDEC_STRIDE); dstv7 = vec_vsx_ld( 0, p_dst + 4*FDEC_STRIDE + 3*FDEC_STRIDE ); vec_s16_t s0 = vec_u8_to_s16_h( dstv0 ); vec_s16_t s1 = vec_u8_to_s16_h( dstv4 ); vec_s16_t s2 = vec_u8_to_s16_h( dstv1 ); vec_s16_t s3 = vec_u8_to_s16_h( dstv5 ); vec_s16_t s4 = vec_u8_to_s16_h( dstv2 ); vec_s16_t s5 = vec_u8_to_s16_h( dstv6 ); vec_s16_t s6 = vec_u8_to_s16_h( dstv3 ); vec_s16_t s7 = vec_u8_to_s16_h( dstv7 ); dcvsum0 = vec_adds( dcv0, s0 ); dcvsum4 = vec_adds( dcv1, s1 ); dcvsum1 = vec_adds( dcv0, s2 ); dcvsum5 = vec_adds( dcv1, s3 ); dcvsum2 = vec_adds( dcv0, s4 ); dcvsum6 = vec_adds( dcv1, s5 ); dcvsum3 = vec_adds( dcv0, s6 ); dcvsum7 = vec_adds( dcv1, s7 ); dcvsum8_0 = vec_packsu( dcvsum0, vec_u8_to_s16_l( dstv0 ) ); dcvsum8_1 = vec_packsu( dcvsum1, vec_u8_to_s16_l( dstv1 ) ); dcvsum8_2 = vec_packsu( dcvsum2, vec_u8_to_s16_l( dstv2 ) ); dcvsum8_3 = vec_packsu( dcvsum3, vec_u8_to_s16_l( dstv3 ) ); dcvsum8_4 = vec_packsu( dcvsum4, vec_u8_to_s16_l( dstv4 ) ); dcvsum8_5 = vec_packsu( dcvsum5, vec_u8_to_s16_l( dstv5 ) ); dcvsum8_6 = vec_packsu( dcvsum6, vec_u8_to_s16_l( dstv6 ) ); dcvsum8_7 = vec_packsu( dcvsum7, vec_u8_to_s16_l( dstv7 ) ); vec_vsx_st( dcvsum8_0, 0, p_dst ); vec_vsx_st( dcvsum8_4, 0, p_dst + 4*FDEC_STRIDE ); vec_vsx_st( dcvsum8_1, 0, p_dst + 1*FDEC_STRIDE ); vec_vsx_st( dcvsum8_5, 0, p_dst + 4*FDEC_STRIDE + 1*FDEC_STRIDE ); vec_vsx_st( dcvsum8_2, 0, p_dst + 2*FDEC_STRIDE ); vec_vsx_st( dcvsum8_6, 0, p_dst + 4*FDEC_STRIDE + 2*FDEC_STRIDE ); vec_vsx_st( dcvsum8_3, 0, p_dst + 3*FDEC_STRIDE ); vec_vsx_st( dcvsum8_7, 0, p_dst + 4*FDEC_STRIDE + 3*FDEC_STRIDE ); } #define LOAD16 \ dstv0 = vec_ld( 0, p_dst ); \ dstv1 = vec_ld( 0, p_dst + 1*FDEC_STRIDE ); \ dstv2 = vec_ld( 0, p_dst + 2*FDEC_STRIDE ); \ dstv3 = vec_ld( 0, p_dst + 3*FDEC_STRIDE ); #define SUM16 \ dcvsum0 = vec_adds( dcv0, vec_u8_to_s16_h( dstv0 ) ); \ dcvsum4 = vec_adds( dcv1, vec_u8_to_s16_l( dstv0 ) ); \ dcvsum1 = vec_adds( dcv0, vec_u8_to_s16_h( dstv1 ) ); \ dcvsum5 = vec_adds( dcv1, vec_u8_to_s16_l( dstv1 ) ); \ dcvsum2 = vec_adds( dcv0, vec_u8_to_s16_h( dstv2 ) ); \ dcvsum6 = vec_adds( dcv1, vec_u8_to_s16_l( dstv2 ) ); \ dcvsum3 = vec_adds( dcv0, vec_u8_to_s16_h( dstv3 ) ); \ dcvsum7 = vec_adds( dcv1, vec_u8_to_s16_l( dstv3 ) ); \ dcvsum8_0 = vec_packsu( dcvsum0, dcvsum4 ); \ dcvsum8_1 = vec_packsu( dcvsum1, dcvsum5 ); \ dcvsum8_2 = vec_packsu( dcvsum2, dcvsum6 ); \ dcvsum8_3 = vec_packsu( dcvsum3, dcvsum7 ); #define STORE16 \ vec_st( dcvsum8_0, 0, p_dst ); \ vec_st( dcvsum8_1, 0, p_dst + 
1*FDEC_STRIDE ); \ vec_st( dcvsum8_2, 0, p_dst + 2*FDEC_STRIDE ); \ vec_st( dcvsum8_3, 0, p_dst + 3*FDEC_STRIDE ); void x264_add16x16_idct_dc_altivec( uint8_t *p_dst, int16_t dct[16] ) { vec_s16_t dcv0, dcv1; vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) ); vec_u16_t v6 = vec_splat_u16( 6 ); vec_u8_t dstv0, dstv1, dstv2, dstv3; vec_s16_t dcvsum0, dcvsum1, dcvsum2, dcvsum3, dcvsum4, dcvsum5, dcvsum6, dcvsum7; vec_u8_t dcvsum8_0, dcvsum8_1, dcvsum8_2, dcvsum8_3; LOAD_ZERO; for( int i = 0; i < 2; i++ ) { vec_s16_t dctv = vec_ld( 0, dct ); dctv = vec_sra( vec_add( dctv, v32 ), v6 ); dcv1 = (vec_s16_t)vec_mergeh( dctv, dctv ); dcv0 = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv1, (vec_s32_t)dcv1 ); dcv1 = (vec_s16_t)vec_mergel( (vec_s32_t)dcv1, (vec_s32_t)dcv1 ); LOAD16; SUM16; STORE16; p_dst += 4*FDEC_STRIDE; dcv1 = (vec_s16_t)vec_mergel( dctv, dctv ); dcv0 = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv1, (vec_s32_t)dcv1 ); dcv1 = (vec_s16_t)vec_mergel( (vec_s32_t)dcv1, (vec_s32_t)dcv1 ); LOAD16; SUM16; STORE16; dct += 8; p_dst += 4*FDEC_STRIDE; } } #define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \ { \ /* a0 = SRC(0) + SRC(2); */ \ vec_s16_t a0v = vec_add(s0, s2); \ /* a1 = SRC(0) - SRC(2); */ \ vec_s16_t a1v = vec_sub(s0, s2); \ /* a2 = (SRC(1)>>1) - SRC(3); */ \ vec_s16_t a2v = vec_sub(vec_sra(s1, onev), s3); \ /* a3 = (SRC(3)>>1) + SRC(1); */ \ vec_s16_t a3v = vec_add(vec_sra(s3, onev), s1); \ /* DST(0, a0 + a3); */ \ d0 = vec_add(a0v, a3v); \ /* DST(1, a1 + a2); */ \ d1 = vec_add(a1v, a2v); \ /* DST(2, a1 - a2); */ \ d2 = vec_sub(a1v, a2v); \ /* DST(3, a0 - a3); */ \ d3 = vec_sub(a0v, a3v); \ } #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ vdst_orig = vec_ld(0, dst); \ vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst); \ va = vec_add(va, vdst_ss); \ va_u8 = vec_s16_to_u8(va); \ va_u32 = vec_splat((vec_u32_t)va_u8, 0); \ vec_ste(va_u32, element, (uint32_t*)dst); #define ALTIVEC_STORE4_SUM_CLIP(dest, idctv) \ { \ /* unaligned load */ \ vec_u8_t dstv = vec_vsx_ld(0, dest); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \ /* unaligned store */ \ vec_u32_t bodyv = vec_splat((vec_u32_t)idstsum8, 0); \ int element = ((unsigned long)dest & 0xf) >> 2; \ vec_ste(bodyv, element, (uint32_t *)dest); \ } void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[16] ) { vec_u16_t onev = vec_splat_u16(1); dct[0] += 32; // rounding for the >>6 at the end vec_s16_t s0, s1, s2, s3; s0 = vec_ld( 0x00, dct ); s1 = vec_sld( s0, s0, 8 ); s2 = vec_ld( 0x10, dct ); s3 = vec_sld( s2, s2, 8 ); vec_s16_t d0, d1, d2, d3; IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 ); vec_s16_t tr0, tr1, tr2, tr3; VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 ); vec_s16_t idct0, idct1, idct2, idct3; IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 ); vec_u16_t sixv = vec_splat_u16(6); LOAD_ZERO; ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0 ); ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1 ); ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2 ); ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3 ); } void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] ) { x264_add4x4_idct_altivec( &p_dst[0], dct[0] ); x264_add4x4_idct_altivec( &p_dst[4], dct[1] ); x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2] ); x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+4], dct[3] ); } void 
x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] ) { x264_add8x8_idct_altivec( &p_dst[0], &dct[0] ); x264_add8x8_idct_altivec( &p_dst[8], &dct[4] ); x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+0], &dct[8] ); x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+8], &dct[12] ); } #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7)\ {\ /* a0 = SRC(0) + SRC(4); */ \ vec_s16_t a0v = vec_add(s0, s4); \ /* a2 = SRC(0) - SRC(4); */ \ vec_s16_t a2v = vec_sub(s0, s4); \ /* a4 = (SRC(2)>>1) - SRC(6); */ \ vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ /* a6 = (SRC(6)>>1) + SRC(2); */ \ vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ /* b0 = a0 + a6; */ \ vec_s16_t b0v = vec_add(a0v, a6v); \ /* b2 = a2 + a4; */ \ vec_s16_t b2v = vec_add(a2v, a4v); \ /* b4 = a2 - a4; */ \ vec_s16_t b4v = vec_sub(a2v, a4v); \ /* b6 = a0 - a6; */ \ vec_s16_t b6v = vec_sub(a0v, a6v); \ /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) );\ /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ /* b1 = (a7>>2) + a1; */ \ vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ /* b3 = a3 + (a5>>2); */ \ vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ /* b5 = (a3>>2) - a5; */ \ vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ /* b7 = a7 - (a1>>2); */ \ vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ /* DST(0, b0 + b7); */ \ d0 = vec_add(b0v, b7v); \ /* DST(1, b2 + b5); */ \ d1 = vec_add(b2v, b5v); \ /* DST(2, b4 + b3); */ \ d2 = vec_add(b4v, b3v); \ /* DST(3, b6 + b1); */ \ d3 = vec_add(b6v, b1v); \ /* DST(4, b6 - b1); */ \ d4 = vec_sub(b6v, b1v); \ /* DST(5, b4 - b3); */ \ d5 = vec_sub(b4v, b3v); \ /* DST(6, b2 - b5); */ \ d6 = vec_sub(b2v, b5v); \ /* DST(7, b0 - b7); */ \ d7 = vec_sub(b0v, b7v); \ } #define ALTIVEC_STORE_SUM_CLIP(dest, idctv) \ { \ vec_s16_t idct_sh6 = vec_sra( idctv, sixv ); \ /* unaligned load */ \ vec_u8_t dstv = vec_vsx_ld( 0, dest ); \ vec_s16_t idstsum = vec_adds( idct_sh6, vec_u8_to_s16_h( dstv ) ); \ vec_u8_t idstsum8 = vec_packsu( idstsum, vec_u8_to_s16_l( dstv ) ); \ /* unaligned store */ \ vec_vsx_st( idstsum8, 0, dest ); \ } void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] ) { vec_u16_t onev = vec_splat_u16(1); vec_u16_t twov = vec_splat_u16(2); dct[0] += 32; // rounding for the >>6 at the end vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; s0 = vec_ld(0x00, dct); s1 = vec_ld(0x10, dct); s2 = vec_ld(0x20, dct); s3 = vec_ld(0x30, dct); s4 = vec_ld(0x40, dct); s5 = vec_ld(0x50, dct); s6 = vec_ld(0x60, dct); s7 = vec_ld(0x70, dct); vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7); vec_s16_t tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7; VEC_TRANSPOSE_8( d0, d1, d2, d3, d4, d5, d6, d7, tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7); vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; IDCT8_1D_ALTIVEC(tr0, tr1, tr2, tr3, tr4, tr5, tr6, tr7, idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); 
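    /* The 8x8 IDCT is computed as two separable 1-D passes: the IDCT8_1D_ALTIVEC
     * call above transforms the rows, VEC_TRANSPOSE_8 swaps rows and columns, and
     * the second call transforms the columns.  The final >>6 scaling (with rounding
     * provided by the dct[0] += 32 at the top of the function) is folded into
     * ALTIVEC_STORE_SUM_CLIP below, which also adds the predicted pixels from dst
     * and saturates the result back to 8 bits. */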
vec_u16_t sixv = vec_splat_u16(6); LOAD_ZERO; ALTIVEC_STORE_SUM_CLIP(&dst[0*FDEC_STRIDE], idct0); ALTIVEC_STORE_SUM_CLIP(&dst[1*FDEC_STRIDE], idct1); ALTIVEC_STORE_SUM_CLIP(&dst[2*FDEC_STRIDE], idct2); ALTIVEC_STORE_SUM_CLIP(&dst[3*FDEC_STRIDE], idct3); ALTIVEC_STORE_SUM_CLIP(&dst[4*FDEC_STRIDE], idct4); ALTIVEC_STORE_SUM_CLIP(&dst[5*FDEC_STRIDE], idct5); ALTIVEC_STORE_SUM_CLIP(&dst[6*FDEC_STRIDE], idct6); ALTIVEC_STORE_SUM_CLIP(&dst[7*FDEC_STRIDE], idct7); } void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] ) { x264_add8x8_idct8_altivec( &dst[0], dct[0] ); x264_add8x8_idct8_altivec( &dst[8], dct[1] ); x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] ); x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] ); } void x264_zigzag_scan_4x4_frame_altivec( int16_t level[16], int16_t dct[16] ) { vec_s16_t dct0v, dct1v; vec_s16_t tmp0v, tmp1v; dct0v = vec_ld(0x00, dct); dct1v = vec_ld(0x10, dct); const vec_u8_t sel0 = (vec_u8_t) CV(0,1,8,9,2,3,4,5,10,11,16,17,24,25,18,19); const vec_u8_t sel1 = (vec_u8_t) CV(12,13,6,7,14,15,20,21,26,27,28,29,22,23,30,31); tmp0v = vec_perm( dct0v, dct1v, sel0 ); tmp1v = vec_perm( dct0v, dct1v, sel1 ); vec_st( tmp0v, 0x00, level ); vec_st( tmp1v, 0x10, level ); } void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[16] ) { vec_s16_t dct0v, dct1v; vec_s16_t tmp0v, tmp1v; dct0v = vec_ld(0x00, dct); dct1v = vec_ld(0x10, dct); const vec_u8_t sel0 = (vec_u8_t) CV(0,1,2,3,8,9,4,5,6,7,10,11,12,13,14,15); tmp0v = vec_perm( dct0v, dct1v, sel0 ); tmp1v = dct1v; vec_st( tmp0v, 0x00, level ); vec_st( tmp1v, 0x10, level ); } void x264_zigzag_scan_8x8_frame_altivec( int16_t level[64], int16_t dct[64] ) { vec_s16_t tmpv[6]; vec_s16_t dct0v = vec_ld( 0*16, dct ); vec_s16_t dct1v = vec_ld( 1*16, dct ); vec_s16_t dct2v = vec_ld( 2*16, dct ); vec_s16_t dct3v = vec_ld( 3*16, dct ); vec_s16_t dct4v = vec_ld( 4*16, dct ); vec_s16_t dct5v = vec_ld( 5*16, dct ); vec_s16_t dct6v = vec_ld( 6*16, dct ); vec_s16_t dct7v = vec_ld( 7*16, dct ); const vec_u8_t mask1[14] = { { 0x00, 0x01, 0x02, 0x03, 0x12, 0x13, 0x14, 0x15, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D }, { 0x0A, 0x0B, 0x0C, 0x0D, 0x00, 0x00, 0x0E, 0x0F, 0x00, 0x00, 0x00, 0x00, 0x10, 0x11, 0x12, 0x13 }, { 0x00, 0x01, 0x02, 0x03, 0x18, 0x19, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F }, { 0x00, 0x00, 0x14, 0x15, 0x18, 0x19, 0x02, 0x03, 0x04, 0x05, 0x08, 0x09, 0x06, 0x07, 0x12, 0x13 }, { 0x12, 0x13, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F }, { 0x1A, 0x1B, 0x10, 0x11, 0x08, 0x09, 0x04, 0x05, 0x02, 0x03, 0x0C, 0x0D, 0x14, 0x15, 0x18, 0x19 }, { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x0A, 0x0B }, { 0x00, 0x01, 0x02, 0x03, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x06, 0x07, 0x04, 0x05, 0x08, 0x09 }, { 0x00, 0x11, 0x16, 0x17, 0x18, 0x19, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x1A, 0x1B }, { 0x02, 0x03, 0x18, 0x19, 0x16, 0x17, 0x1A, 0x1B, 0x1C, 0x1D, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09 }, { 0x08, 0x09, 0x0A, 0x0B, 0x06, 0x07, 0x0E, 0x0F, 0x10, 0x11, 0x00, 0x00, 0x12, 0x13, 0x14, 0x15 }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F }, { 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x06, 0x07, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F } }; tmpv[0] = vec_mergeh( dct0v, dct1v ); tmpv[1] = vec_mergeh( dct2v, dct3v ); tmpv[2] = 
(vec_s16_t)vec_mergeh( (vec_s32_t)tmpv[0], (vec_s32_t)tmpv[1] ); tmpv[3] = vec_perm( tmpv[2], dct0v, mask1[0] ); vec_st( tmpv[3], 0*16, level ); tmpv[4] = vec_mergeh( dct4v, dct5v ); tmpv[3] = vec_perm( tmpv[0], tmpv[4], mask1[1] ); tmpv[3] = vec_perm( tmpv[3], dct0v, mask1[2] ); tmpv[3] = vec_perm( tmpv[3], tmpv[1], mask1[3] ); vec_st( tmpv[3], 1*16, level ); tmpv[3] = vec_mergel( dct0v, dct1v ); tmpv[1] = vec_mergel( tmpv[1], dct2v ); tmpv[5] = vec_perm( tmpv[3], tmpv[1], mask1[4] ); tmpv[5] = vec_perm( tmpv[5], dct4v, mask1[5] ); vec_st( tmpv[5], 2*16, level ); tmpv[2] = vec_mergeh( dct5v, dct6v ); tmpv[5] = vec_mergeh( tmpv[2], dct7v ); tmpv[4] = vec_mergel( tmpv[4], tmpv[1] ); tmpv[0] = vec_perm( tmpv[5], tmpv[4], mask1[6] ); vec_st( tmpv[0], 3*16, level ); tmpv[1] = vec_mergel( dct2v, dct3v ); tmpv[0] = vec_mergel( dct4v, dct5v ); tmpv[4] = vec_perm( tmpv[1], tmpv[0], mask1[7] ); tmpv[3] = vec_perm( tmpv[4], tmpv[3], mask1[8] ); vec_st( tmpv[3], 4*16, level ); tmpv[3] = vec_mergeh( dct6v, dct7v ); tmpv[2] = vec_mergel( dct3v, dct4v ); tmpv[2] = vec_perm( tmpv[2], dct5v, mask1[9] ); tmpv[3] = vec_perm( tmpv[2], tmpv[3], mask1[10] ); vec_st( tmpv[3], 5*16, level ); tmpv[1] = vec_mergel( tmpv[1], tmpv[2] ); tmpv[2] = vec_mergel( dct6v, dct7v ); tmpv[1] = vec_perm( tmpv[1], tmpv[2], mask1[11] ); tmpv[1] = vec_perm( tmpv[1], dct7v, mask1[12] ); vec_st( tmpv[1], 6*16, level ); tmpv[2] = vec_perm( tmpv[2], tmpv[0], mask1[13] ); vec_st( tmpv[2], 7*16, level ); } void x264_zigzag_interleave_8x8_cavlc_altivec( int16_t *dst, int16_t *src, uint8_t *nnz ) { vec_s16_t tmpv[8]; vec_s16_t merge[2]; vec_s16_t permv[3]; vec_s16_t orv[4]; vec_s16_t src0v = vec_ld( 0*16, src ); vec_s16_t src1v = vec_ld( 1*16, src ); vec_s16_t src2v = vec_ld( 2*16, src ); vec_s16_t src3v = vec_ld( 3*16, src ); vec_s16_t src4v = vec_ld( 4*16, src ); vec_s16_t src5v = vec_ld( 5*16, src ); vec_s16_t src6v = vec_ld( 6*16, src ); vec_s16_t src7v = vec_ld( 7*16, src ); vec_u8_t pack; vec_u8_t nnzv = vec_vsx_ld( 0, nnz ); vec_u8_t shift = vec_splat_u8( 7 ); LOAD_ZERO; const vec_u8_t mask[3] = { { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }, { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }, { 0x10, 0x11, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x12, 0x13, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F } }; tmpv[0] = vec_mergeh( src0v, src1v ); tmpv[1] = vec_mergel( src0v, src1v ); tmpv[2] = vec_mergeh( src2v, src3v ); tmpv[3] = vec_mergel( src2v, src3v ); tmpv[4] = vec_mergeh( src4v, src5v ); tmpv[5] = vec_mergel( src4v, src5v ); tmpv[6] = vec_mergeh( src6v, src7v ); tmpv[7] = vec_mergel( src6v, src7v ); merge[0] = vec_mergeh( tmpv[0], tmpv[1] ); merge[1] = vec_mergeh( tmpv[2], tmpv[3] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[1] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 0*16, dst ); merge[0] = vec_mergeh( tmpv[4], tmpv[5] ); merge[1] = vec_mergeh( tmpv[6], tmpv[7] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[2] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 1*16, dst ); vec_st( permv[1], 2*16, dst ); vec_st( permv[2], 3*16, dst ); merge[0] = vec_mergel( tmpv[0], tmpv[1] ); merge[1] = vec_mergel( tmpv[2], tmpv[3] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[1] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 4*16, dst ); merge[0] = vec_mergel( tmpv[4], tmpv[5] ); merge[1] = vec_mergel( tmpv[6], tmpv[7] ); permv[0] = vec_perm( merge[0], merge[1], 
mask[0] ); permv[2] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 5*16, dst ); vec_st( permv[1], 6*16, dst ); vec_st( permv[2], 7*16, dst ); orv[0] = vec_or( src0v, src1v ); orv[1] = vec_or( src2v, src3v ); orv[2] = vec_or( src4v, src5v ); orv[3] = vec_or( src6v, src7v ); permv[0] = vec_or( orv[0], orv[1] ); permv[1] = vec_or( orv[2], orv[3] ); permv[0] = vec_or( permv[0], permv[1] ); permv[1] = vec_perm( permv[0], permv[0], mask[1] ); permv[0] = vec_or( permv[0], permv[1] ); pack = (vec_u8_t)vec_packs( permv[0], permv[0] ); pack = (vec_u8_t)vec_cmpeq( pack, zerov ); pack = vec_nor( pack, zerov ); pack = vec_sr( pack, shift ); nnzv = vec_perm( nnzv, pack, mask[2] ); vec_st( nnzv, 0, nnz ); } #endif // !HIGH_BIT_DEPTH x264-master/common/ppc/dct.h000066400000000000000000000075121502133446700160610ustar00rootroot00000000000000/***************************************************************************** * dct.h: ppc transform and zigzag ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Eric Petit * Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_PPC_DCT_H #define X264_PPC_DCT_H #define x264_sub4x4_dct_altivec x264_template(sub4x4_dct_altivec) void x264_sub4x4_dct_altivec( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_altivec x264_template(sub8x8_dct_altivec) void x264_sub8x8_dct_altivec( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_altivec x264_template(sub16x16_dct_altivec) void x264_sub16x16_dct_altivec( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_add8x8_idct_dc_altivec x264_template(add8x8_idct_dc_altivec) void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] ); #define x264_add16x16_idct_dc_altivec x264_template(add16x16_idct_dc_altivec) void x264_add16x16_idct_dc_altivec( uint8_t *p_dst, int16_t dct[16] ); #define x264_add4x4_idct_altivec x264_template(add4x4_idct_altivec) void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[16] ); #define x264_add8x8_idct_altivec x264_template(add8x8_idct_altivec) void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][16] ); #define x264_add16x16_idct_altivec x264_template(add16x16_idct_altivec) void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] ); #define x264_sub8x8_dct_dc_altivec x264_template(sub8x8_dct_dc_altivec) void x264_sub8x8_dct_dc_altivec( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct8_altivec x264_template(sub8x8_dct8_altivec) void x264_sub8x8_dct8_altivec( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct8_altivec x264_template(sub16x16_dct8_altivec) void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); #define x264_add8x8_idct8_altivec x264_template(add8x8_idct8_altivec) void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[64] ); #define x264_add16x16_idct8_altivec x264_template(add16x16_idct8_altivec) void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][64] ); #define x264_zigzag_scan_4x4_frame_altivec x264_template(zigzag_scan_4x4_frame_altivec) void x264_zigzag_scan_4x4_frame_altivec( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_4x4_field_altivec x264_template(zigzag_scan_4x4_field_altivec) void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_8x8_frame_altivec x264_template(zigzag_scan_8x8_frame_altivec) void x264_zigzag_scan_8x8_frame_altivec( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_interleave_8x8_cavlc_altivec x264_template(zigzag_interleave_8x8_cavlc_altivec) void x264_zigzag_interleave_8x8_cavlc_altivec( int16_t *dst, int16_t *src, uint8_t *nnz ); #endif x264-master/common/ppc/deblock.c000066400000000000000000000415431502133446700167070ustar00rootroot00000000000000/***************************************************************************** * deblock.c: ppc deblocking ***************************************************************************** * Copyright (C) 2007-2025 x264 project * * Authors: Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "ppccommon.h" #include "deblock.h" #if !HIGH_BIT_DEPTH #define transpose4x16(r0, r1, r2, r3) \ { \ register vec_u8_t r4; \ register vec_u8_t r5; \ register vec_u8_t r6; \ register vec_u8_t r7; \ \ r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ \ r0 = vec_mergeh(r4, r6); /*all set 0*/ \ r1 = vec_mergel(r4, r6); /*all set 1*/ \ r2 = vec_mergeh(r5, r7); /*all set 2*/ \ r3 = vec_mergel(r5, r7); /*all set 3*/ \ } static inline void write16x4( uint8_t *dst, int dst_stride, register vec_u8_t r0, register vec_u8_t r1, register vec_u8_t r2, register vec_u8_t r3 ) { ALIGNED_16(unsigned char result[64]); uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; int int_dst_stride = dst_stride >> 2; vec_st(r0, 0, result); vec_st(r1, 16, result); vec_st(r2, 32, result); vec_st(r3, 48, result); /* FIXME: there has to be a better way!!!! */ *dst_int = *src_int; *(dst_int+ int_dst_stride) = *(src_int + 1); *(dst_int+ 2*int_dst_stride) = *(src_int + 2); *(dst_int+ 3*int_dst_stride) = *(src_int + 3); *(dst_int+ 4*int_dst_stride) = *(src_int + 4); *(dst_int+ 5*int_dst_stride) = *(src_int + 5); *(dst_int+ 6*int_dst_stride) = *(src_int + 6); *(dst_int+ 7*int_dst_stride) = *(src_int + 7); *(dst_int+ 8*int_dst_stride) = *(src_int + 8); *(dst_int+ 9*int_dst_stride) = *(src_int + 9); *(dst_int+10*int_dst_stride) = *(src_int + 10); *(dst_int+11*int_dst_stride) = *(src_int + 11); *(dst_int+12*int_dst_stride) = *(src_int + 12); *(dst_int+13*int_dst_stride) = *(src_int + 13); *(dst_int+14*int_dst_stride) = *(src_int + 14); *(dst_int+15*int_dst_stride) = *(src_int + 15); } /** \brief performs a 6x16 transpose of data in src, and stores it to dst */ #define read_and_transpose16x6(src, src_stride, r8, r9, r10, r11, r12, r13)\ {\ register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\ r0 = vec_vsx_ld(0, src); \ r1 = vec_vsx_ld(src_stride, src); \ r2 = vec_vsx_ld(2*src_stride, src); \ r3 = vec_vsx_ld(3*src_stride, src); \ r4 = vec_vsx_ld(4*src_stride, src); \ r5 = vec_vsx_ld(5*src_stride, src); \ r6 = vec_vsx_ld(6*src_stride, src); \ r7 = vec_vsx_ld(7*src_stride, src); \ r8 = vec_vsx_ld(8*src_stride, src); \ r9 = vec_vsx_ld(9*src_stride, src); \ r10 = vec_vsx_ld(10*src_stride, src); \ r11 = vec_vsx_ld(11*src_stride, src); \ r12 = vec_vsx_ld(12*src_stride, src); \ r13 = vec_vsx_ld(13*src_stride, src); \ r14 = vec_vsx_ld(14*src_stride, src); \ r15 = vec_vsx_ld(15*src_stride, src); \ \ /*Merge first pairs*/ \ r0 = vec_mergeh(r0, r8); /*0, 8*/ \ r1 = vec_mergeh(r1, r9); /*1, 9*/ \ r2 = vec_mergeh(r2, r10); /*2,10*/ \ r3 = vec_mergeh(r3, r11); /*3,11*/ \ r4 = vec_mergeh(r4, r12); /*4,12*/ \ r5 = vec_mergeh(r5, r13); /*5,13*/ \ r6 = vec_mergeh(r6, r14); /*6,14*/ \ r7 = vec_mergeh(r7, r15); /*7,15*/ \ \ /*Merge second pairs*/ \ r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \ r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ r11 = vec_mergel(r1, r5); 
/*1,5, 9,13 set 1*/ \ r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ \ /*Third merge*/ \ r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ /* Don't need to compute 3 and 7*/ \ \ /*Final merge*/ \ r8 = vec_mergeh(r0, r4); /*all set 0*/ \ r9 = vec_mergel(r0, r4); /*all set 1*/ \ r10 = vec_mergeh(r1, r5); /*all set 2*/ \ r11 = vec_mergel(r1, r5); /*all set 3*/ \ r12 = vec_mergeh(r2, r6); /*all set 4*/ \ r13 = vec_mergel(r2, r6); /*all set 5*/ \ /* Don't need to compute 14 and 15*/ \ \ } // out: o = |x-y| < a static inline vec_u8_t diff_lt_altivec( register vec_u8_t x, register vec_u8_t y, register vec_u8_t a ) { return (vec_u8_t)vec_cmplt(vec_absd(x, y), a); } static inline vec_u8_t h264_deblock_mask( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t q0, register vec_u8_t q1, register vec_u8_t alpha, register vec_u8_t beta ) { register vec_u8_t mask; register vec_u8_t tempmask; mask = diff_lt_altivec(p0, q0, alpha); tempmask = diff_lt_altivec(p1, p0, beta); mask = vec_and(mask, tempmask); tempmask = diff_lt_altivec(q1, q0, beta); mask = vec_and(mask, tempmask); return mask; } // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) static inline vec_u8_t h264_deblock_q1( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t p2, register vec_u8_t q0, register vec_u8_t tc0 ) { register vec_u8_t average = vec_avg(p0, q0); register vec_u8_t temp; register vec_u8_t uncliped; register vec_u8_t ones; register vec_u8_t max; register vec_u8_t min; register vec_u8_t newp1; temp = vec_xor(average, p2); average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ ones = vec_splat_u8(1); temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ max = vec_adds(p1, tc0); min = vec_subs(p1, tc0); newp1 = vec_max(min, uncliped); newp1 = vec_min(max, newp1); return newp1; } #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) \ { \ const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ \ register vec_u8_t pq0bit = vec_xor(p0,q0); \ register vec_u8_t q1minus; \ register vec_u8_t p0minus; \ register vec_u8_t stage1; \ register vec_u8_t stage2; \ register vec_u8_t vec160; \ register vec_u8_t delta; \ register vec_u8_t deltaneg; \ \ q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */\ stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ vec160 = vec_ld(0, &A0v); \ deltaneg = vec_subs(vec160, stage2); /* -d */ \ delta = vec_subs(stage2, vec160); /* d */ \ deltaneg = vec_min(tc0masked, deltaneg); \ delta = vec_min(tc0masked, delta); \ p0 = vec_subs(p0, deltaneg); \ q0 = vec_subs(q0, delta); \ p0 = vec_adds(p0, delta); \ q0 = vec_adds(q0, deltaneg); \ } #define 
h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) \ { \ ALIGNED_16(unsigned char temp[16]); \ register vec_u8_t alphavec; \ register vec_u8_t betavec; \ register vec_u8_t mask; \ register vec_u8_t p1mask; \ register vec_u8_t q1mask; \ register vec_s8_t tc0vec; \ register vec_u8_t finaltc0; \ register vec_u8_t tc0masked; \ register vec_u8_t newp1; \ register vec_u8_t newq1; \ \ temp[0] = alpha; \ temp[1] = beta; \ alphavec = vec_ld(0, temp); \ betavec = vec_splat(alphavec, 0x1); \ alphavec = vec_splat(alphavec, 0x0); \ mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ \ M32( temp ) = M32( tc0 ); \ tc0vec = vec_ld(0, (signed char*)temp); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \ mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ \ p1mask = diff_lt_altivec(p2, p0, betavec); \ p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta ) */ \ tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ /*end if*/ \ \ q1mask = diff_lt_altivec(q2, q0, betavec); \ q1mask = vec_and(q1mask, mask); /* if( |q2 - q0| < beta ) */ \ tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ /*end if*/ \ \ h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ p1 = newp1; \ q1 = newq1; \ } void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0 ) { register vec_u8_t p2 = vec_ld(-3*stride, pix); register vec_u8_t p1 = vec_ld(-2*stride, pix); register vec_u8_t p0 = vec_ld(-1*stride, pix); register vec_u8_t q0 = vec_ld(0, pix); register vec_u8_t q1 = vec_ld(stride, pix); register vec_u8_t q2 = vec_ld(2*stride, pix); h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); vec_st(p1, -2*stride, pix); vec_st(p0, -1*stride, pix); vec_st(q0, 0, pix); vec_st(q1, stride, pix); } } void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { register vec_u8_t line0, line1, line2, line3, line4, line5; if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0 ) return; read_and_transpose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); transpose4x16(line1, line2, line3, line4); write16x4(pix-2, stride, line1, line2, line3, line4); } #endif // !HIGH_BIT_DEPTH x264-master/common/ppc/deblock.h000066400000000000000000000031331502133446700167050ustar00rootroot00000000000000/***************************************************************************** * deblock.h: ppc deblocking ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_PPC_DEBLOCK_H #define X264_PPC_DEBLOCK_H #define x264_deblock_v_luma_altivec x264_template(deblock_v_luma_altivec) void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_altivec x264_template(deblock_h_luma_altivec) void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif x264-master/common/ppc/mc.c000066400000000000000000001501651502133446700157040ustar00rootroot00000000000000/***************************************************************************** * mc.c: ppc motion compensation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Eric Petit * Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "ppccommon.h" #include "mc.h" #if !HIGH_BIT_DEPTH typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, uint8_t *dst, intptr_t i_dst, int i_height ); static inline void pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { for( int y = 0; y < i_height; y++ ) { #ifndef __POWER9_VECTOR__ for( int x = 0; x < 4; x++ ) dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; #else vec_u8_t s1 = vec_vsx_ld( 0, src1 ); vec_u8_t s2 = vec_vsx_ld( 0, src2 ); vec_u8_t avg = vec_avg( s1, s2 ); vec_xst_len( avg, dst, 4 ); #endif dst += i_dst; src1 += i_src1; src2 += i_src1; } } static inline void pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { vec_u8_t src1v, src2v; for( int y = 0; y < i_height; y++ ) { src1v = vec_vsx_ld( 0, src1 ); src2v = vec_vsx_ld( 0, src2 ); src1v = vec_avg( src1v, src2v ); VEC_STORE8(src1v, dst); dst += i_dst; src1 += i_src1; src2 += i_src1; } } static inline void pixel_avg2_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { vec_u8_t src1v, src2v; for( int y = 0; y < i_height; y++ ) { src1v = vec_vsx_ld( 0, src1 ); src2v = vec_vsx_ld( 0, src2 ); src1v = vec_avg( src1v, src2v ); vec_st(src1v, 0, dst); dst += i_dst; src1 += i_src1; src2 += i_src1; } } static inline void pixel_avg2_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height); pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height); } /* mc_copy: plain c */ #ifndef __POWER9_VECTOR__ #define tiny_copy( d, s, l ) memcpy( d, s, l ) #else #define tiny_copy( d, s, l ) vec_xst_len( vec_vsx_ld( 0, s ), d, l ) #endif #define MC_COPY( name, a ) \ static void name( uint8_t *dst, intptr_t i_dst, \ uint8_t *src, intptr_t i_src, int i_height ) \ { \ int y; \ for( y = 0; y < i_height; y++ ) \ { \ memcpy( dst, src, a ); \ src += i_src; \ dst += i_dst; \ } \ } MC_COPY( mc_copy_w4_altivec, 4 ) MC_COPY( mc_copy_w8_altivec, 8 ) static void mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, int i_height ) { vec_u8_t cpyV; for( int y = 0; y < i_height; y++ ) { cpyV = vec_vsx_ld( 0, src ); vec_st(cpyV, 0, dst); src += i_src; dst += i_dst; } } static void mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, int i_height ) { for( int y = 0; y < i_height; ++y ) { vec_u8_t cpyV = vec_ld( 0, src ); vec_st(cpyV, 0, dst); src += i_src; dst += i_dst; } } #define x264_plane_copy_swap_core_altivec x264_template(plane_copy_swap_core_altivec) void x264_plane_copy_swap_core_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, int w, int h ) { const vec_u8_t mask = { 0x01, 0x00, 0x03, 0x02, 0x05, 0x04, 0x07, 0x06, 0x09, 0x08, 0x0B, 0x0A, 0x0D, 0x0C, 0x0F, 0x0E }; for( int y = 0; y < h; y++, dst += i_dst, src += i_src ) for( int x = 0; x < 2 * w; x += 16 ) { vec_u8_t srcv = vec_vsx_ld( x, src ); vec_u8_t dstv = vec_perm( srcv, srcv, mask ); vec_vsx_st( dstv, x, dst ); } } #define x264_plane_copy_interleave_core_altivec x264_template(plane_copy_interleave_core_altivec) void x264_plane_copy_interleave_core_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, intptr_t i_srcu, uint8_t *srcv, intptr_t i_srcv, int w, int h ) { for( int y = 0; y < h; y++, dst += i_dst, 
srcu += i_srcu, srcv += i_srcv ) for( int x = 0; x < w; x += 16 ) { vec_u8_t srcvv = vec_vsx_ld( x, srcv ); vec_u8_t srcuv = vec_vsx_ld( x, srcu ); vec_u8_t dstv1 = vec_mergeh( srcuv, srcvv ); vec_u8_t dstv2 = vec_mergel( srcuv, srcvv ); vec_vsx_st( dstv1, 2 * x, dst ); vec_vsx_st( dstv2, 2 * x + 16, dst ); } } void x264_store_interleave_chroma_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) { for( int y = 0; y < height; y++, dst += i_dst, srcu += FDEC_STRIDE, srcv += FDEC_STRIDE ) { vec_u8_t srcvv = vec_vsx_ld( 0, srcv ); vec_u8_t srcuv = vec_vsx_ld( 0, srcu ); vec_u8_t dstv = vec_mergeh( srcuv, srcvv ); vec_vsx_st(dstv, 0, dst); } } void x264_plane_copy_deinterleave_altivec( uint8_t *dstu, intptr_t i_dstu, uint8_t *dstv, intptr_t i_dstv, uint8_t *src, intptr_t i_src, int w, int h ) { const vec_u8_t mask[2] = { { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E }, { 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F } }; for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src ) { for( int x = 0; x < w; x += 16 ) { vec_u8_t srcv1 = vec_vsx_ld( 2 * x, src ); vec_u8_t srcv2 = vec_vsx_ld( 2 * x + 16, src ); vec_u8_t dstuv = vec_perm( srcv1, srcv2, mask[0] ); vec_u8_t dstvv = vec_perm( srcv1, srcv2, mask[1] ); vec_vsx_st( dstuv, x, dstu ); vec_vsx_st( dstvv, x, dstv ); } } } static void load_deinterleave_chroma_fenc_altivec( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ) { const vec_u8_t mask = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F }; for( int y = 0; y < height; y += 2, dst += 2*FENC_STRIDE, src += 2*i_src ) { vec_u8_t src0 = vec_ld( 0, src ); vec_u8_t src1 = vec_ld( i_src, src ); vec_st( vec_perm( src0, src0, mask ), 0*FENC_STRIDE, dst ); vec_st( vec_perm( src1, src1, mask ), 1*FENC_STRIDE, dst ); } } #if HAVE_VSX void x264_plane_copy_deinterleave_rgb_altivec( uint8_t *dsta, intptr_t i_dsta, uint8_t *dstb, intptr_t i_dstb, uint8_t *dstc, intptr_t i_dstc, uint8_t *src, intptr_t i_src, int pw, int w, int h ) { if( pw == 3 ) { const vec_u8_t mask[4] = { { 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0x10, 0x13, 0x16 }, { 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x18, 0x1B, 0x1E }, { 0x02, 0x05, 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }, { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F } }; for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src ) { for( int x = 0; x < w; x += 16 ) { vec_u8_t srcv1 = vec_vsx_ld( 3 * x, src ); vec_u8_t srcv2 = vec_vsx_ld( 3 * x + 16, src ); vec_u8_t srcv3 = vec_vsx_ld( 3 * x + 32, src ); vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv2, srcv3, mask[1] ); // a8 a9 a10 a11 a12 a13 a14 a15 b8 b9 b10 b11 b12 b13 b14 b15 vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta ); vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb ); srcv1 = vec_perm( srcv1, srcv2, mask[2] ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 srcv1 = vec_perm( srcv1, srcv3, mask[3] ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 vec_st( srcv1, x, dstc ); } } } else { const vec_u8_t mask[2] = { { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D }, { 
0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x17, 0x1B, 0x1F } }; for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src ) { for( int x = 0; x < w; x += 16 ) { vec_u8_t srcv1 = vec_vsx_ld( 4 * x, src ); vec_u8_t srcv2 = vec_vsx_ld( 4 * x + 16, src ); vec_u8_t srcv3 = vec_vsx_ld( 4 * x + 32, src ); vec_u8_t srcv4 = vec_vsx_ld( 4 * x + 48, src ); vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[0] ); // a8 a9 a10 a11 a12 a13 a14 a15 b8 b9 b10 b11 b12 b13 b14 b15 vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta ); vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb ); tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[1] ); // c0 c1 c2 c3 c4 c5 c6 c7 tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[1] ); // c8 c9 c10 c11 c12 c13 c14 c15 vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dstc ); } } } } #endif static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { case 4: pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); break; case 8: pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); break; case 16: default: pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); } if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height ); } else if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height ); else { switch( i_width ) { case 4: mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height ); break; case 8: mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height ); break; case 16: mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height ); break; } } } static uint8_t *get_ref_altivec( uint8_t *dst, intptr_t *i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { case 4: pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); break; case 8: pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); break; case 12: case 16: default: pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); break; case 20: pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); break; } if( weight->weightfn ) weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height ); return dst; } else if( weight->weightfn ) { weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height ); return dst; } else { *i_dst_stride = i_src_stride; return src1; 
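/* Note: in this branch get_ref_altivec avoids a copy entirely -- no sub-pel
   interpolation and no weighting are needed, so the caller receives a pointer
   straight into the selected reference plane, with that plane's stride
   reported back through *i_dst_stride. */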
} } static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx&0x07; int d8y = mvy&0x07; int cA = (8-d8x)*(8-d8y); int cB = d8x *(8-d8y); int cC = (8-d8x)*d8y; int cD = d8x *d8y; src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; for( int y = 0; y < i_height; y++ ) { dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6; dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6; dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6; dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6; src += i_src_stride; srcp += i_src_stride; dstu += i_dst_stride; dstv += i_dst_stride; } } #ifdef WORDS_BIGENDIAN #define VSLD(a,b,n) vec_sld(a,b,n) #else #define VSLD(a,b,n) vec_sld(b,a,16-n) #endif #ifndef __POWER9_VECTOR__ #define STORE4_ALIGNED(d, s) vec_ste( (vec_u32_t)s, 0, (uint32_t*) d ) #define STORE2_UNALIGNED(d, s) vec_ste( vec_splat( (vec_u16_t)s, 0 ), 0, (uint16_t*)d ) #else #define STORE4_ALIGNED(d, s) vec_xst_len( (vec_u8_t)s, d, 4 ) #define STORE2_UNALIGNED(d, s) vec_xst_len( (vec_u8_t)s, d, 2 ) #endif static void mc_chroma_4xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx & 0x07; int d8y = mvy & 0x07; ALIGNED_16( uint16_t coeff[4] ); coeff[0] = (8-d8x)*(8-d8y); coeff[1] = d8x *(8-d8y); coeff[2] = (8-d8x)*d8y; coeff[3] = d8x *d8y; src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; LOAD_ZERO; vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v; vec_u8_t src2v_8, dstuv, dstvv; vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16; vec_u16_t shiftv, k32v; #ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13); static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15); #else static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12); static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14); #endif coeff0v = vec_ld( 0, coeff ); coeff3v = vec_splat( coeff0v, 3 ); coeff2v = vec_splat( coeff0v, 2 ); coeff1v = vec_splat( coeff0v, 1 ); coeff0v = vec_splat( coeff0v, 0 ); k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); shiftv = vec_splat_u16( 6 ); src2v_8 = vec_vsx_ld( 0, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); for( int y = 0; y < i_height; y += 2 ) { src0v_16 = src2v_16; src1v_16 = src3v_16; src2v_8 = vec_vsx_ld( 0, srcp ); src2v_16 = vec_u8_to_u16( src2v_8 ); src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 ); dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 ); dstv16 = vec_sr( dstv16, shiftv ); dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v ); dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v ); STORE4_ALIGNED( dstu, dstuv ); STORE4_ALIGNED( dstv, dstvv ); srcp += i_src_stride; dstu += i_dst_stride; dstv += i_dst_stride; src0v_16 = src2v_16; src1v_16 = src3v_16; src2v_8 = vec_vsx_ld( 0, srcp ); src2v_16 = vec_u8_to_u16( src2v_8 ); src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 ); dstv16 = vec_mladd( 
coeff3v, src3v_16, dstv16 ); dstv16 = vec_sr( dstv16, shiftv ); dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v ); dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v ); STORE4_ALIGNED( dstu, dstuv ); STORE4_ALIGNED( dstv, dstvv ); srcp += i_src_stride; dstu += i_dst_stride; dstv += i_dst_stride; } } static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx & 0x07; int d8y = mvy & 0x07; ALIGNED_16( uint16_t coeff[4] ); coeff[0] = (8-d8x)*(8-d8y); coeff[1] = d8x *(8-d8y); coeff[2] = (8-d8x)*d8y; coeff[3] = d8x *d8y; src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2; srcp = &src[i_src_stride]; LOAD_ZERO; vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v; vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8; vec_u8_t dstuv, dstvv; vec_u16_t src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h; vec_u16_t src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l; vec_u16_t shiftv, k32v; coeff0v = vec_ld( 0, coeff ); coeff3v = vec_splat( coeff0v, 3 ); coeff2v = vec_splat( coeff0v, 2 ); coeff1v = vec_splat( coeff0v, 1 ); coeff0v = vec_splat( coeff0v, 0 ); k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); shiftv = vec_splat_u16( 6 ); #ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0); static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0); #else static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1); static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1); #endif src2v_8 = vec_vsx_ld( 0, src ); src3v_8 = vec_vsx_ld( 16, src ); src3v_8 = VSLD( src2v_8, src3v_8, 2 ); for( int y = 0; y < i_height; y += 2 ) { src0v_8 = src2v_8; src1v_8 = src3v_8; src2v_8 = vec_vsx_ld( 0, srcp ); src3v_8 = vec_vsx_ld( 16, srcp ); src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); src1v_16h = vec_u8_to_u16_h( src1v_8 ); src1v_16l = vec_u8_to_u16_l( src1v_8 ); src2v_16h = vec_u8_to_u16_h( src2v_8 ); src2v_16l = vec_u8_to_u16_l( src2v_8 ); src3v_16h = vec_u8_to_u16_h( src3v_8 ); src3v_16l = vec_u8_to_u16_l( src3v_8 ); dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v ); dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v ); dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h ); dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l ); dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h ); dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l ); dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h ); dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l ); dstv_16h = vec_sr( dstv_16h, shiftv ); dstv_16l = vec_sr( dstv_16l, shiftv ); dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v ); dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v ); VEC_STORE8( dstuv, dstu ); VEC_STORE8( dstvv, dstv ); srcp += i_src_stride; dstu += i_dst_stride; dstv += i_dst_stride; src0v_8 = src2v_8; src1v_8 = src3v_8; src2v_8 = vec_vsx_ld( 0, srcp ); src3v_8 = vec_vsx_ld( 16, srcp ); src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); src1v_16h = vec_u8_to_u16_h( src1v_8 ); src1v_16l = vec_u8_to_u16_l( src1v_8 ); src2v_16h = vec_u8_to_u16_h( src2v_8 ); src2v_16l = vec_u8_to_u16_l( src2v_8 ); src3v_16h = vec_u8_to_u16_h( src3v_8 ); src3v_16l = vec_u8_to_u16_l( src3v_8 ); dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v ); dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v ); dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h 
); dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l ); dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h ); dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l ); dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h ); dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l ); dstv_16h = vec_sr( dstv_16h, shiftv ); dstv_16l = vec_sr( dstv_16l, shiftv ); dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v ); dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v ); VEC_STORE8( dstuv, dstu ); VEC_STORE8( dstvv, dstv ); srcp += i_src_stride; dstu += i_dst_stride; dstv += i_dst_stride; } } static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height ) { if( i_width == 8 ) mc_chroma_8xh_altivec( dstu, dstv, i_dst_stride, src, i_src_stride, mvx, mvy, i_height ); else if( i_width == 4 ) mc_chroma_4xh_altivec( dstu, dstv, i_dst_stride, src, i_src_stride, mvx, mvy, i_height ); else mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride, mvx, mvy, i_height ); } #define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \ { \ t1v = vec_add( t1v, t6v ); \ t2v = vec_add( t2v, t5v ); \ t3v = vec_add( t3v, t4v ); \ \ t1v = vec_sub( t1v, t2v ); /* (a-b) */ \ t2v = vec_sub( t2v, t3v ); /* (b-c) */ \ t2v = vec_sl( t2v, twov ); /* (b-c)*4 */ \ t1v = vec_sub( t1v, t2v ); /* a-5*b+4*c */ \ t3v = vec_sl( t3v, fourv ); /* 16*c */ \ t1v = vec_add( t1v, t3v ); /* a-5*b+20*c */ \ } #define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \ { \ t1v = vec_add( t1v, t6v ); \ t2v = vec_add( t2v, t5v ); \ t3v = vec_add( t3v, t4v ); \ \ t1v = vec_sub( t1v, t2v ); /* (a-b) */ \ t1v = vec_sra( t1v, twov ); /* (a-b)/4 */ \ t1v = vec_sub( t1v, t2v ); /* (a-b)/4-b */ \ t1v = vec_add( t1v, t3v ); /* (a-b)/4-b+c */ \ t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \ t1v = vec_add( t1v, t3v ); /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \ } #define HPEL_FILTER_HORIZONTAL() \ { \ src1v = vec_vsx_ld( x- 2+i_stride*y, src ); \ src6v = vec_vsx_ld( x+14+i_stride*y, src ); \ \ src2v = VSLD( src1v, src6v, 1 ); \ src3v = VSLD( src1v, src6v, 2 ); \ src4v = VSLD( src1v, src6v, 3 ); \ src5v = VSLD( src1v, src6v, 4 ); \ src6v = VSLD( src1v, src6v, 5 ); \ \ temp1v = vec_u8_to_s16_h( src1v ); \ temp2v = vec_u8_to_s16_h( src2v ); \ temp3v = vec_u8_to_s16_h( src3v ); \ temp4v = vec_u8_to_s16_h( src4v ); \ temp5v = vec_u8_to_s16_h( src5v ); \ temp6v = vec_u8_to_s16_h( src6v ); \ \ HPEL_FILTER_1( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ \ dest1v = vec_add( temp1v, sixteenv ); \ dest1v = vec_sra( dest1v, fivev ); \ \ temp1v = vec_u8_to_s16_l( src1v ); \ temp2v = vec_u8_to_s16_l( src2v ); \ temp3v = vec_u8_to_s16_l( src3v ); \ temp4v = vec_u8_to_s16_l( src4v ); \ temp5v = vec_u8_to_s16_l( src5v ); \ temp6v = vec_u8_to_s16_l( src6v ); \ \ HPEL_FILTER_1( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ \ dest2v = vec_add( temp1v, sixteenv ); \ dest2v = vec_sra( dest2v, fivev ); \ \ destv = vec_packsu( dest1v, dest2v ); \ \ vec_vsx_st( destv, x+i_stride*y, dsth ); \ } #define HPEL_FILTER_VERTICAL() \ { \ src1v = vec_vsx_ld( x+i_stride*(y-2), src ); \ src2v = vec_vsx_ld( x+i_stride*(y-1), src ); \ src3v = vec_vsx_ld( x+i_stride*(y-0), src ); \ src4v = vec_vsx_ld( x+i_stride*(y+1), src ); \ src5v = vec_vsx_ld( x+i_stride*(y+2), src ); \ src6v = vec_vsx_ld( x+i_stride*(y+3), src ); \ \ temp1v = vec_u8_to_s16_h( src1v ); \ temp2v = vec_u8_to_s16_h( src2v ); \ temp3v = vec_u8_to_s16_h( src3v ); \ temp4v = vec_u8_to_s16_h( src4v ); \ temp5v = 
vec_u8_to_s16_h( src5v ); \ temp6v = vec_u8_to_s16_h( src6v ); \ \ HPEL_FILTER_1( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ \ dest1v = vec_add( temp1v, sixteenv ); \ dest1v = vec_sra( dest1v, fivev ); \ \ temp4v = vec_u8_to_s16_l( src1v ); \ temp5v = vec_u8_to_s16_l( src2v ); \ temp6v = vec_u8_to_s16_l( src3v ); \ temp7v = vec_u8_to_s16_l( src4v ); \ temp8v = vec_u8_to_s16_l( src5v ); \ temp9v = vec_u8_to_s16_l( src6v ); \ \ HPEL_FILTER_1( temp4v, temp5v, temp6v, \ temp7v, temp8v, temp9v ); \ \ dest2v = vec_add( temp4v, sixteenv ); \ dest2v = vec_sra( dest2v, fivev ); \ \ destv = vec_packsu( dest1v, dest2v ); \ \ vec_vsx_st( destv, x+i_stride*y, dstv ); \ } #define HPEL_FILTER_CENTRAL() \ { \ temp1v = VSLD( tempav, tempbv, 12 ); \ temp2v = VSLD( tempav, tempbv, 14 ); \ temp3v = tempbv; \ temp4v = VSLD( tempbv, tempcv, 2 ); \ temp5v = VSLD( tempbv, tempcv, 4 ); \ temp6v = VSLD( tempbv, tempcv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ \ dest1v = vec_add( temp1v, thirtytwov ); \ dest1v = vec_sra( dest1v, sixv ); \ \ temp1v = VSLD( tempbv, tempcv, 12 ); \ temp2v = VSLD( tempbv, tempcv, 14 ); \ temp3v = tempcv; \ temp4v = VSLD( tempcv, tempdv, 2 ); \ temp5v = VSLD( tempcv, tempdv, 4 ); \ temp6v = VSLD( tempcv, tempdv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ \ dest2v = vec_add( temp1v, thirtytwov ); \ dest2v = vec_sra( dest2v, sixv ); \ \ destv = vec_packsu( dest1v, dest2v ); \ \ vec_vsx_st( destv, x-16+i_stride*y, dstc ); \ } void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t i_stride, int i_width, int i_height, int16_t *buf ) { vec_u8_t destv; vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v; vec_s16_t dest1v, dest2v; vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v; vec_s16_t tempav, tempbv, tempcv, tempdv, tempev; LOAD_ZERO; vec_u16_t twov, fourv, fivev, sixv; vec_s16_t sixteenv, thirtytwov; twov = vec_splats( (uint16_t)2 ); fourv = vec_splats( (uint16_t)4 ); fivev = vec_splats( (uint16_t)5 ); sixv = vec_splats( (uint16_t)6 ); sixteenv = vec_splats( (int16_t)16 ); thirtytwov = vec_splats( (int16_t)32 ); for( int y = 0; y < i_height; y++ ) { int x = 0; /* horizontal_filter */ HPEL_FILTER_HORIZONTAL(); /* vertical_filter */ HPEL_FILTER_VERTICAL(); /* central_filter */ tempav = tempcv; tempbv = tempdv; tempcv = vec_splat( temp1v, 0 ); /* first only */ tempdv = temp1v; tempev = temp4v; for( x = 16; x < i_width; x+=16 ) { /* horizontal_filter */ HPEL_FILTER_HORIZONTAL(); /* vertical_filter */ HPEL_FILTER_VERTICAL(); /* central_filter */ tempav = tempcv; tempbv = tempdv; tempcv = tempev; tempdv = temp1v; tempev = temp4v; HPEL_FILTER_CENTRAL(); } /* Partial vertical filter */ src1v = vec_vsx_ld( x+i_stride*(y-2), src ); src2v = vec_vsx_ld( x+i_stride*(y-1), src ); src3v = vec_vsx_ld( x+i_stride*(y-0), src ); src4v = vec_vsx_ld( x+i_stride*(y+1), src ); src5v = vec_vsx_ld( x+i_stride*(y+2), src ); src6v = vec_vsx_ld( x+i_stride*(y+3), src ); temp1v = vec_u8_to_s16_h( src1v ); temp2v = vec_u8_to_s16_h( src2v ); temp3v = vec_u8_to_s16_h( src3v ); temp4v = vec_u8_to_s16_h( src4v ); temp5v = vec_u8_to_s16_h( src5v ); temp6v = vec_u8_to_s16_h( src6v ); HPEL_FILTER_1( temp1v, temp2v, temp3v, temp4v, temp5v, temp6v ); /* central_filter */ tempav = tempcv; tempbv = tempdv; tempcv = tempev; tempdv = temp1v; /* tempev is not used */ HPEL_FILTER_CENTRAL(); } } static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t 
*dsth, uint8_t *dstv, uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width, int height ) { int w = width >> 4; int end = (width & 15); vec_u8_t src0v, src1v, src2v; vec_u8_t lv, hv, src1p1v; vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv; static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E ); #ifndef WORDS_BIGENDIAN static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F ); #endif for( int y = 0; y < height; y++ ) { int x; uint8_t *src1 = src0+src_stride; uint8_t *src2 = src1+src_stride; src0v = vec_ld(0, src0); src1v = vec_ld(0, src1); src2v = vec_ld(0, src2); avg0v = vec_avg(src0v, src1v); avg1v = vec_avg(src1v, src2v); for( x = 0; x < w; x++ ) { lv = vec_ld(16*(x*2+1), src0); src1v = vec_ld(16*(x*2+1), src1); avghv = vec_avg(lv, src1v); lv = vec_ld(16*(x*2+2), src0); src1p1v = vec_ld(16*(x*2+2), src1); avghp1v = vec_avg(lv, src1p1v); avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v); avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv); vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0); #ifdef WORDS_BIGENDIAN vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth); #else vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth); #endif avg0v = avghp1v; hv = vec_ld(16*(x*2+1), src2); avghv = vec_avg(src1v, hv); hv = vec_ld(16*(x*2+2), src2); avghp1v = vec_avg(src1p1v, hv); avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v); avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv); vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv); #ifdef WORDS_BIGENDIAN vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc); #else vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc); #endif avg1v = avghp1v; } if( end ) { lv = vec_ld(16*(x*2+1), src0); src1v = vec_ld(16*(x*2+1), src1); avghv = vec_avg(lv, src1v); lv = vec_ld(16*(x*2+1), src2); avghp1v = vec_avg(src1v, lv); avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v); avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v); lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle); #ifdef WORDS_BIGENDIAN hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv); #else hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1); #endif VEC_STORE8( lv, dst0 + 16 * x ); VEC_STORE8( hv, dsth + 16 * x ); lv = vec_sld(lv, lv, 8); hv = vec_sld(hv, hv, 8); VEC_STORE8( lv, dstv + 16 * x ); VEC_STORE8( hv, dstc + 16 * x ); } src0 += src_stride*2; dst0 += dst_stride; dsth += dst_stride; dstv += dst_stride; dstc += dst_stride; } } static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; vec_u8_t srcv; vec_s16_t weightv; vec_s16_t scalev, offsetv, denomv, roundv; int denom = weight->i_denom; scalev = vec_splats( (int16_t)weight->i_scale ); offsetv = vec_splats( (int16_t)weight->i_offset ); if( denom >= 1 ) { denomv = vec_splats( (int16_t)denom ); roundv = vec_splats( (int16_t)(1 << (denom - 1)) ); for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weightv = vec_u8_to_s16( srcv ); weightv = vec_mladd( weightv, scalev, roundv ); weightv = vec_sra( weightv, (vec_u16_t)denomv ); weightv = vec_add( weightv, offsetv ); srcv = vec_packsu( weightv, zero_s16v ); STORE2_UNALIGNED( dst, srcv ); } } 
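/* Zero-denominator path: no rounding term and no right shift are required, so
   the weight reduces to dst = clamp( src * scale + offset ), computed below
   with a single vec_mladd followed by a saturating pack. */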
else { for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weightv = vec_u8_to_s16( srcv ); weightv = vec_mladd( weightv, scalev, offsetv ); srcv = vec_packsu( weightv, zero_s16v ); STORE2_UNALIGNED( dst, srcv ); } } } static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; vec_u8_t srcv; vec_s16_t weightv; vec_s16_t scalev, offsetv, denomv, roundv; int denom = weight->i_denom; scalev = vec_splats( (int16_t)weight->i_scale ); offsetv = vec_splats( (int16_t)weight->i_offset ); if( denom >= 1 ) { denomv = vec_splats( (int16_t)denom ); roundv = vec_splats( (int16_t)(1 << (denom - 1)) ); for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weightv = vec_u8_to_s16( srcv ); weightv = vec_mladd( weightv, scalev, roundv ); weightv = vec_sra( weightv, (vec_u16_t)denomv ); weightv = vec_add( weightv, offsetv ); srcv = vec_packsu( weightv, zero_s16v ); vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst ); } } else { for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weightv = vec_u8_to_s16( srcv ); weightv = vec_mladd( weightv, scalev, offsetv ); srcv = vec_packsu( weightv, zero_s16v ); vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst ); } } } static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; vec_u8_t srcv; vec_s16_t weightv; vec_s16_t scalev, offsetv, denomv, roundv; int denom = weight->i_denom; scalev = vec_splats( (int16_t)weight->i_scale ); offsetv = vec_splats( (int16_t)weight->i_offset ); if( denom >= 1 ) { denomv = vec_splats( (int16_t)denom ); roundv = vec_splats( (int16_t)(1 << (denom - 1)) ); for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weightv = vec_u8_to_s16( srcv ); weightv = vec_mladd( weightv, scalev, roundv ); weightv = vec_sra( weightv, (vec_u16_t)denomv ); weightv = vec_add( weightv, offsetv ); srcv = vec_packsu( weightv, zero_s16v ); VEC_STORE8( srcv, dst ); } } else { for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weightv = vec_u8_to_s16( srcv ); weightv = vec_mladd( weightv, scalev, offsetv ); srcv = vec_packsu( weightv, zero_s16v ); VEC_STORE8( srcv, dst ); } } } static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; vec_u8_t srcv; vec_s16_t weight_lv, weight_hv; vec_s16_t scalev, offsetv, denomv, roundv; int denom = weight->i_denom; scalev = vec_splats( (int16_t)weight->i_scale ); offsetv = vec_splats( (int16_t)weight->i_offset ); if( denom >= 1 ) { denomv = vec_splats( (int16_t)denom ); roundv = vec_splats( (int16_t)(1 << (denom - 1)) ); for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); weight_hv = vec_u8_to_s16_h( srcv ); weight_lv = vec_u8_to_s16_l( srcv ); weight_hv = vec_mladd( weight_hv, scalev, roundv ); weight_lv = vec_mladd( weight_lv, scalev, roundv ); weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv ); weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv ); weight_hv = vec_add( weight_hv, offsetv ); weight_lv = vec_add( weight_lv, offsetv ); srcv = vec_packsu( weight_hv, weight_lv ); vec_st( srcv, 0, dst ); } } else { for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = 
vec_vsx_ld( 0, src ); weight_hv = vec_u8_to_s16_h( srcv ); weight_lv = vec_u8_to_s16_l( srcv ); weight_hv = vec_mladd( weight_hv, scalev, offsetv ); weight_lv = vec_mladd( weight_lv, scalev, offsetv ); srcv = vec_packsu( weight_hv, weight_lv ); vec_st( srcv, 0, dst ); } } } static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; vec_u8_t srcv, srcv2; vec_s16_t weight_lv, weight_hv, weight_3v; vec_s16_t scalev, offsetv, denomv, roundv; int denom = weight->i_denom; scalev = vec_splats( (int16_t)weight->i_scale ); offsetv = vec_splats( (int16_t)weight->i_offset ); if( denom >= 1 ) { int16_t round = 1 << (denom - 1); vec_s16_t tab[4] = { { weight->i_scale, weight->i_scale, weight->i_scale, weight->i_scale, 1, 1, 1, 1 }, { weight->i_offset, weight->i_offset, weight->i_offset, weight->i_offset, 0, 0, 0, 0 }, { denom, denom, denom, denom, 0, 0, 0, 0 }, { round, round, round, round, 0, 0, 0, 0 }, }; denomv = vec_splats( (int16_t)denom ); roundv = vec_splats( (int16_t)(1 << (denom - 1)) ); for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); srcv2 = vec_vsx_ld( 16, src ); weight_hv = vec_u8_to_s16_h( srcv ); weight_lv = vec_u8_to_s16_l( srcv ); weight_3v = vec_u8_to_s16_h( srcv2 ); weight_hv = vec_mladd( weight_hv, scalev, roundv ); weight_lv = vec_mladd( weight_lv, scalev, roundv ); weight_3v = vec_mladd( weight_3v, tab[0], tab[3] ); weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv ); weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv ); weight_3v = vec_sra( weight_3v, (vec_u16_t)tab[2] ); weight_hv = vec_add( weight_hv, offsetv ); weight_lv = vec_add( weight_lv, offsetv ); weight_3v = vec_add( weight_3v, tab[1] ); srcv = vec_packsu( weight_hv, weight_lv ); srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ) ); vec_vsx_st( srcv, 0, dst ); vec_vsx_st( srcv2, 16, dst ); } } else { vec_s16_t offset_mask = { weight->i_offset, weight->i_offset, weight->i_offset, weight->i_offset, 0, 0, 0, 0 }; for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src ) { srcv = vec_vsx_ld( 0, src ); srcv2 = vec_vsx_ld( 16, src ); weight_hv = vec_u8_to_s16_h( srcv ); weight_lv = vec_u8_to_s16_l( srcv ); weight_3v = vec_u8_to_s16_h( srcv2 ); weight_hv = vec_mladd( weight_hv, scalev, offsetv ); weight_lv = vec_mladd( weight_lv, scalev, offsetv ); weight_3v = vec_mladd( weight_3v, scalev, offset_mask ); srcv = vec_packsu( weight_hv, weight_lv ); srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ) ); vec_vsx_st( srcv, 0, dst ); vec_vsx_st( srcv2, 16, dst ); } } } static weight_fn_t mc_weight_wtab_altivec[6] = { mc_weight_w2_altivec, mc_weight_w4_altivec, mc_weight_w8_altivec, mc_weight_w16_altivec, mc_weight_w16_altivec, mc_weight_w20_altivec, }; PLANE_COPY_SWAP(16, altivec) PLANE_INTERLEAVE(altivec) #endif // !HIGH_BIT_DEPTH #if HIGH_BIT_DEPTH #define LOAD_SRC( l ) \ { \ srcv[l] = vec_vsx_ld( s, src ); \ s += 16; \ srcv[l + 1] = vec_vsx_ld( s, src ); \ s += 16; \ } #define STORE_8( mask, shift, dst, a, b ) \ { \ dstv = (vec_u16_t)vec_perm( srcv[a], srcv[b], mask ); \ dstv = vec_sr( dstv, shift ); \ dstv = vec_and( dstv, and_mask ); \ \ vec_st( dstv, offset, dst ); \ } // v210 input is only compatible with bit-depth of 10 bits void x264_plane_copy_deinterleave_v210_altivec( uint16_t *dsty, intptr_t i_dsty, uint16_t *dstc, intptr_t i_dstc, uint32_t *src, intptr_t i_src, int w, int h ) { #ifdef WORDS_BIGENDIAN const vec_u8_t masky[3] = { { 0x02, 0x01, 0x05, 0x04, 0x07, 0x06, 
0x0A, 0x09, 0x0D, 0x0C, 0x0F, 0x0E, 0x12, 0x11, 0x15, 0x14 }, { 0x07, 0x06, 0x0A, 0x09, 0x0D, 0x0C, 0x0F, 0x0E, 0x12, 0x11, 0x15, 0x14, 0x17, 0x16, 0x1A, 0x19 }, { 0x0D, 0x0C, 0x0F, 0x0E, 0x12, 0x11, 0x15, 0x14, 0x17, 0x16, 0x1A, 0x19, 0x1D, 0x1C, 0x1F, 0x1E } }; const vec_u8_t maskc[3] = { { 0x01, 0x00, 0x03, 0x02, 0x06, 0x05, 0x09, 0x08, 0x0B, 0x0A, 0x0E, 0x0D, 0x11, 0x10, 0x13, 0x12 }, { 0x06, 0x05, 0x09, 0x08, 0x0B, 0x0A, 0x0E, 0x0D, 0x11, 0x10, 0x13, 0x12, 0x16, 0x15, 0x19, 0x18 }, { 0x0B, 0x0A, 0x0E, 0x0D, 0x11, 0x10, 0x13, 0x12, 0x16, 0x15, 0x19, 0x18, 0x1B, 0x1A, 0x1E, 0x1D } }; #else const vec_u8_t masky[3] = { { 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0x11, 0x12, 0x14, 0x15 }, { 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0x11, 0x12, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1A }, { 0x0C, 0x0D, 0x0E, 0x0F, 0x11, 0x12, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1A, 0x1C, 0x1D, 0x1E, 0x1F } }; const vec_u8_t maskc[3] = { { 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x13 }, { 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19 }, { 0x0A, 0x0B, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x1A, 0x1B, 0x1D, 0x1E } }; #endif const vec_u16_t shift[3] = { { 0, 4, 2, 0, 4, 2, 0, 4 }, { 2, 0, 4, 2, 0, 4, 2, 0 }, { 4, 2, 0, 4, 2, 0, 4, 2 } }; vec_u16_t dstv; vec_u16_t and_mask = vec_sub( vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 10 ) ), vec_splat_u16( 1 ) ); vec_u32_t srcv[4]; for( int i = 0; i < h; i++ ) { int offset = 0; int s = 0; for( int j = 0; j < w; j += 24 ) { LOAD_SRC( 0 ); STORE_8( maskc[0], shift[0], dstc, 0, 1 ); STORE_8( masky[0], shift[1], dsty, 0, 1 ); offset += 16; LOAD_SRC( 2 ); STORE_8( maskc[1], shift[1], dstc, 1, 2 ); STORE_8( masky[1], shift[2], dsty, 1, 2 ); offset += 16; STORE_8( maskc[2], shift[2], dstc, 2, 3 ); STORE_8( masky[2], shift[0], dsty, 2, 3 ); offset += 16; } dsty += i_dsty; dstc += i_dstc; src += i_src; } } #endif // HIGH_BIT_DEPTH void x264_mc_init_altivec( x264_mc_functions_t *pf ) { #if HIGH_BIT_DEPTH pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_altivec; #else // !HIGH_BIT_DEPTH pf->mc_luma = mc_luma_altivec; pf->get_ref = get_ref_altivec; pf->mc_chroma = mc_chroma_altivec; pf->copy_16x16_unaligned = mc_copy_w16_altivec; pf->copy[PIXEL_16x16] = mc_copy_w16_aligned_altivec; pf->hpel_filter = x264_hpel_filter_altivec; pf->frame_init_lowres_core = frame_init_lowres_core_altivec; pf->weight = mc_weight_wtab_altivec; pf->plane_copy_swap = plane_copy_swap_altivec; pf->plane_copy_interleave = plane_copy_interleave_altivec; pf->store_interleave_chroma = x264_store_interleave_chroma_altivec; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_altivec; pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc_altivec; #if HAVE_VSX pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_altivec; #endif // HAVE_VSX #endif // !HIGH_BIT_DEPTH } x264-master/common/ppc/mc.h000066400000000000000000000025461502133446700157100ustar00rootroot00000000000000/***************************************************************************** * mc.h: ppc motion compensation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Eric Petit * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the 
License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_PPC_MC_H #define X264_PPC_MC_H #define x264_mc_init_altivec x264_template(mc_init_altivec) void x264_mc_init_altivec( x264_mc_functions_t *pf ); #endif x264-master/common/ppc/pixel.c000066400000000000000000002001321502133446700164140ustar00rootroot00000000000000/***************************************************************************** * pixel.c: ppc pixel metrics ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Eric Petit * Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "ppccommon.h" #include "pixel.h" #if !HIGH_BIT_DEPTH /*********************************************************************** * SAD routines **********************************************************************/ #define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) \ static int name( uint8_t *pix1, intptr_t i_pix1, \ uint8_t *pix2, intptr_t i_pix2 ) \ { \ ALIGNED_16( int sum ); \ \ LOAD_ZERO; \ vec_u8_t pix1v, pix2v; \ vec_s32_t sumv = zero_s32v; \ for( int y = 0; y < ly; y++ ) \ { \ pix1v = vec_vsx_ld( 0, pix1 ); \ pix2v = vec_vsx_ld( 0, pix2 ); \ sumv = (vec_s32_t) vec_sum4s( \ vec_absd( pix1v, pix2v ), \ (vec_u32_t) sumv ); \ pix1 += i_pix1; \ pix2 += i_pix2; \ } \ sumv = vec_sum##a( sumv, zero_s32v ); \ sumv = vec_splat( sumv, b ); \ vec_ste( sumv, 0, &sum ); \ return sum; \ } PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s, 3 ) PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec, 8, 16, 2s, 1 ) PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec, 16, 8, s, 3 ) PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 ) /*********************************************************************** * SATD routines **********************************************************************/ /*********************************************************************** * VEC_HADAMAR *********************************************************************** * b[0] = a[0] + a[1] + a[2] + a[3] * b[1] = a[0] + a[1] - a[2] - a[3] * b[2] = a[0] - a[1] - a[2] + a[3] * b[3] = a[0] - a[1] + a[2] - a[3] **********************************************************************/ #define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \ b2 = vec_add( a0, a1 ); \ b3 = vec_add( a2, a3 ); \ a0 = vec_sub( a0, a1 ); \ a2 = vec_sub( a2, a3 ); \ b0 = vec_add( b2, b3 ); \ b1 = vec_sub( b2, b3 ); \ b2 = vec_sub( a0, a2 ); \ b3 = vec_add( a0, a2 ) /*********************************************************************** * VEC_ABS *********************************************************************** * a: s16v * * a = abs(a) * * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs() * actually also calls vec_splat(0), but we already have a null vector. 
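 *
 * Per lane this computes a = max( a, 0 - a ), i.e. |a|.  The values fed
 * through it are Hadamard-transformed 8-bit pixel differences, which stay
 * well within the int16 range, so the INT16_MIN corner case of this
 * max/sub trick is not a concern here.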
**********************************************************************/ #define VEC_ABS(a) \ a = vec_max( a, vec_sub( zero_s16v, a ) ); #define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) ) /*********************************************************************** * VEC_ADD_ABS *********************************************************************** * a: s16v * b, c: s32v * * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + [bi] **********************************************************************/ #define VEC_ADD_ABS(a,b,c) \ VEC_ABS( a ); \ c = vec_sum4s( a, b ) static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b, vec_s16_t c, vec_s16_t d ) { vec_s16_t t0 = vec_abs( a ); vec_s16_t t1 = vec_abs( b ); vec_s16_t t2 = vec_abs( c ); vec_s16_t t3 = vec_abs( d ); vec_s16_t s0 = vec_adds( t0, t1 ); vec_s16_t s1 = vec_adds( t2, t3 ); vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) ); vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) ); return vec_add( s01, s23 ); } /*********************************************************************** * SATD 4x4 **********************************************************************/ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); PREP_DIFF; vec_s16_t diff0v, diff1v, diff2v, diff3v; vec_s16_t temp0v, temp1v, temp2v, temp3v; vec_s32_t satdv; VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v ); /* Hadamar H */ VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v ); /* Hadamar V */ VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v ); satdv = vec_sum2s( satdv, zero_s32v ); satdv = vec_splat( satdv, 1 ); vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } /*********************************************************************** * SATD 4x8 **********************************************************************/ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); PREP_DIFF; vec_s16_t diff0v, diff1v, diff2v, diff3v; vec_s16_t temp0v, temp1v, temp2v, temp3v; vec_s32_t satdv; VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); satdv = vec_add( satdv, add_abs_4( temp0v, temp1v, temp2v, temp3v ) ); satdv = vec_sum2s( satdv, zero_s32v ); satdv = vec_splat( satdv, 1 ); 
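    /* Reduction tail: vec_sum2s folded the 32-bit partial sums together and
     * the vec_splat above broadcasts the lane holding the total, so the
     * vec_ste below can store it into the aligned scalar.  The >>1 on the
     * return value follows x264's SATD convention of half the sum of
     * absolute Hadamard-transformed differences. */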
vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } static ALWAYS_INLINE vec_s32_t add_abs_8( vec_s16_t a, vec_s16_t b, vec_s16_t c, vec_s16_t d, vec_s16_t e, vec_s16_t f, vec_s16_t g, vec_s16_t h ) { vec_s16_t t0 = vec_abs( a ); vec_s16_t t1 = vec_abs( b ); vec_s16_t t2 = vec_abs( c ); vec_s16_t t3 = vec_abs( d ); vec_s16_t s0 = vec_adds( t0, t1 ); vec_s16_t s1 = vec_adds( t2, t3 ); vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) ); vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) ); vec_s16_t t4 = vec_abs( e ); vec_s16_t t5 = vec_abs( f ); vec_s16_t t6 = vec_abs( g ); vec_s16_t t7 = vec_abs( h ); vec_s16_t s2 = vec_adds( t4, t5 ); vec_s16_t s3 = vec_adds( t6, t7 ); vec_s32_t s0145 = vec_sum4s( s2, s01 ); vec_s32_t s2367 = vec_sum4s( s3, s23 ); return vec_add( s0145, s2367 ); } /*********************************************************************** * SATD 8x4 **********************************************************************/ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); PREP_DIFF; vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v; vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v; vec_s32_t satdv; VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); /* This causes warnings because temp4v...temp7v haven't be set, but we don't care */ VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ); satdv = vec_sum2s( satdv, zero_s32v ); satdv = vec_splat( satdv, 1 ); vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } /*********************************************************************** * SATD 8x8 **********************************************************************/ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); PREP_DIFF; vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v; vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v; vec_s32_t satdv; VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, 
temp6v, temp7v ); satdv = vec_sums( satdv, zero_s32v ); satdv = vec_splat( satdv, 3 ); vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } /*********************************************************************** * SATD 8x16 **********************************************************************/ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); PREP_DIFF; vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v; vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v; vec_s32_t satdv; VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v ); VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v ); satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ) ); satdv = vec_sums( satdv, zero_s32v ); satdv = vec_splat( satdv, 3 ); vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } /*********************************************************************** * SATD 16x8 **********************************************************************/ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); LOAD_ZERO; vec_s32_t satdv; vec_s16_t pix1v, pix2v; vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v; vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v; vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v; VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, 
diffl4v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v ); VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v ); VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v ); satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ); VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v ); VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v ); satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ) ); satdv = vec_sums( satdv, zero_s32v ); satdv = vec_splat( satdv, 3 ); vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } /*********************************************************************** * SATD 16x16 **********************************************************************/ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); LOAD_ZERO; vec_s32_t satdv; vec_s16_t pix1v, pix2v; vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v; vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v; vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v; VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v ); VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v ); VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v ); satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ); VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v ); VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffl4v, diffl5v, 
diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v ); satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ) ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v ); VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v ); VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v ); VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v ); satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ) ); VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v ); VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v ); VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v ); VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v ); satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v ) ); satdv = vec_sums( satdv, zero_s32v ); satdv = vec_splat( satdv, 3 ); vec_ste( satdv, 0, &i_satd ); return i_satd >> 1; } /*********************************************************************** * Interleaved SAD routines **********************************************************************/ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); ALIGNED_16( int sum2 ); ALIGNED_16( int sum3 ); LOAD_ZERO; vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v; vec_s32_t sum0v, sum1v, sum2v, sum3v; sum0v = vec_splat_s32(0); sum1v = vec_splat_s32(0); sum2v = vec_splat_s32(0); sum3v = vec_splat_s32(0); for( int y = 0; y < 8; y++ ) { pix0v = vec_vsx_ld( 0, pix0 ); pix0 += i_stride; pix1v = vec_vsx_ld( 0, pix1 ); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc += FENC_STRIDE; pix2v = vec_vsx_ld( 0, pix2 ); pix2 += i_stride; pix3v = vec_vsx_ld( 0, pix3 ); pix3 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v ); pix0v = vec_vsx_ld( 0, pix0 ); pix0 += i_stride; pix1v = vec_vsx_ld( 0, pix1 ); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc += FENC_STRIDE; pix2v = vec_vsx_ld( 0, pix2 ); pix2 += i_stride; pix3v = vec_vsx_ld( 0, pix3 ); pix3 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = 
(vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v ); } sum0v = vec_sums( sum0v, zero_s32v ); sum1v = vec_sums( sum1v, zero_s32v ); sum2v = vec_sums( sum2v, zero_s32v ); sum3v = vec_sums( sum3v, zero_s32v ); sum0v = vec_splat( sum0v, 3 ); sum1v = vec_splat( sum1v, 3 ); sum2v = vec_splat( sum2v, 3 ); sum3v = vec_splat( sum3v, 3 ); vec_ste( sum0v, 0, &sum0); vec_ste( sum1v, 0, &sum1); vec_ste( sum2v, 0, &sum2); vec_ste( sum3v, 0, &sum3); scores[0] = sum0; scores[1] = sum1; scores[2] = sum2; scores[3] = sum3; } static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); ALIGNED_16( int sum2 ); LOAD_ZERO; vec_u8_t fencv, pix0v, pix1v, pix2v; vec_s32_t sum0v, sum1v, sum2v; sum0v = vec_splat_s32(0); sum1v = vec_splat_s32(0); sum2v = vec_splat_s32(0); for( int y = 0; y < 8; y++ ) { pix0v = vec_vsx_ld( 0, pix0 ); pix0 += i_stride; pix1v = vec_vsx_ld( 0, pix1 ); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc += FENC_STRIDE; pix2v = vec_vsx_ld( 0, pix2 ); pix2 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); pix0v = vec_vsx_ld( 0, pix0 ); pix0 += i_stride; pix1v = vec_vsx_ld( 0, pix1 ); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc += FENC_STRIDE; pix2v = vec_vsx_ld( 0, pix2 ); pix2 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); } sum0v = vec_sums( sum0v, zero_s32v ); sum1v = vec_sums( sum1v, zero_s32v ); sum2v = vec_sums( sum2v, zero_s32v ); sum0v = vec_splat( sum0v, 3 ); sum1v = vec_splat( sum1v, 3 ); sum2v = vec_splat( sum2v, 3 ); vec_ste( sum0v, 0, &sum0); vec_ste( sum1v, 0, &sum1); vec_ste( sum2v, 0, &sum2); scores[0] = sum0; scores[1] = sum1; scores[2] = sum2; } static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); ALIGNED_16( int sum2 ); ALIGNED_16( int sum3 ); LOAD_ZERO; vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v; vec_s32_t sum0v, sum1v, sum2v, sum3v; sum0v = vec_splat_s32(0); sum1v = vec_splat_s32(0); sum2v = vec_splat_s32(0); sum3v = vec_splat_s32(0); for( int y = 0; y < 4; y++ ) { pix0v = vec_vsx_ld( 0, pix0 ); pix0 += i_stride; pix1v = vec_vsx_ld( 0, pix1 ); pix1 += i_stride; fencv = vec_ld( 0, fenc ); fenc += FENC_STRIDE; pix2v = vec_vsx_ld( 0, pix2 ); pix2 += i_stride; pix3v = vec_vsx_ld( 0, pix3 ); pix3 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v ); pix0v = vec_vsx_ld( 0, pix0 ); pix0 += i_stride; pix1v = vec_vsx_ld( 0, pix1 ); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc += FENC_STRIDE; pix2v = vec_vsx_ld( 0, pix2 ); pix2 += i_stride; pix3v = vec_vsx_ld( 0, pix3 ); pix3 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v 
); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v ); } sum0v = vec_sums( sum0v, zero_s32v ); sum1v = vec_sums( sum1v, zero_s32v ); sum2v = vec_sums( sum2v, zero_s32v ); sum3v = vec_sums( sum3v, zero_s32v ); sum0v = vec_splat( sum0v, 3 ); sum1v = vec_splat( sum1v, 3 ); sum2v = vec_splat( sum2v, 3 ); sum3v = vec_splat( sum3v, 3 ); vec_ste( sum0v, 0, &sum0); vec_ste( sum1v, 0, &sum1); vec_ste( sum2v, 0, &sum2); vec_ste( sum3v, 0, &sum3); scores[0] = sum0; scores[1] = sum1; scores[2] = sum2; scores[3] = sum3; } #define PROCESS_PIXS \ vec_u8_t pix0vH = vec_vsx_ld( 0, pix0 ); \ pix0 += i_stride; \ \ vec_u8_t pix1vH = vec_vsx_ld( 0, pix1 ); \ pix1 += i_stride; \ \ vec_u8_t fencvH = vec_vsx_ld( 0, fenc ); \ fenc += FENC_STRIDE; \ \ vec_u8_t pix2vH = vec_vsx_ld( 0, pix2 ); \ pix2 += i_stride; \ \ vec_u8_t pix0vL = vec_vsx_ld( 0, pix0 ); \ pix0 += i_stride; \ \ vec_u8_t pix1vL = vec_vsx_ld( 0, pix1 ); \ pix1 += i_stride; \ \ vec_u8_t fencvL = vec_vsx_ld( 0, fenc ); \ fenc += FENC_STRIDE; \ \ vec_u8_t pix2vL = vec_vsx_ld( 0, pix2 ); \ pix2 += i_stride; \ \ fencv = xxpermdi( fencvH, fencvL, 0 ); \ pix0v = xxpermdi( pix0vH, pix0vL, 0 ); \ pix1v = xxpermdi( pix1vH, pix1vL, 0 ); \ pix2v = xxpermdi( pix2vH, pix2vL, 0 ); \ \ sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); \ sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); \ sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); #define PIXEL_SAD_X3_ALTIVEC( name, ly ) \ static void name( uint8_t *fenc, uint8_t *pix0, \ uint8_t *pix1, uint8_t *pix2, \ intptr_t i_stride, int scores[3] ) \ { \ ALIGNED_16( int sum0 ); \ ALIGNED_16( int sum1 ); \ ALIGNED_16( int sum2 ); \ \ LOAD_ZERO; \ vec_u8_t fencv, pix0v, pix1v, pix2v; \ vec_s32_t sum0v, sum1v, sum2v; \ \ sum0v = vec_splat_s32( 0 ); \ sum1v = vec_splat_s32( 0 ); \ sum2v = vec_splat_s32( 0 ); \ \ for( int y = 0; y < ly; y++ ) \ { \ PROCESS_PIXS \ } \ \ sum0v = vec_sums( sum0v, zero_s32v ); \ sum1v = vec_sums( sum1v, zero_s32v ); \ sum2v = vec_sums( sum2v, zero_s32v ); \ \ sum0v = vec_splat( sum0v, 3 ); \ sum1v = vec_splat( sum1v, 3 ); \ sum2v = vec_splat( sum2v, 3 ); \ \ vec_ste( sum0v, 0, &sum0 ); \ vec_ste( sum1v, 0, &sum1 ); \ vec_ste( sum2v, 0, &sum2 ); \ \ scores[0] = sum0; \ scores[1] = sum1; \ scores[2] = sum2; \ } PIXEL_SAD_X3_ALTIVEC( pixel_sad_x3_8x8_altivec, 4 ) PIXEL_SAD_X3_ALTIVEC( pixel_sad_x3_8x16_altivec, 8 ) static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); ALIGNED_16( int sum2 ); LOAD_ZERO; vec_u8_t fencv, pix0v, pix1v, pix2v; vec_s32_t sum0v, sum1v, sum2v; sum0v = vec_splat_s32(0); sum1v = vec_splat_s32(0); sum2v = vec_splat_s32(0); for( int y = 0; y < 4; y++ ) { pix0v = vec_vsx_ld(0, pix0); pix0 += i_stride; pix1v = vec_vsx_ld(0, pix1); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc += FENC_STRIDE; pix2v = vec_vsx_ld(0, pix2); pix2 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); pix0v = vec_vsx_ld(0, pix0); pix0 += i_stride; pix1v = vec_vsx_ld(0, pix1); pix1 += i_stride; fencv = vec_ld(0, fenc); fenc 
+= FENC_STRIDE; pix2v = vec_vsx_ld(0, pix2); pix2 += i_stride; sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); } sum0v = vec_sums( sum0v, zero_s32v ); sum1v = vec_sums( sum1v, zero_s32v ); sum2v = vec_sums( sum2v, zero_s32v ); sum0v = vec_splat( sum0v, 3 ); sum1v = vec_splat( sum1v, 3 ); sum2v = vec_splat( sum2v, 3 ); vec_ste( sum0v, 0, &sum0); vec_ste( sum1v, 0, &sum1); vec_ste( sum2v, 0, &sum2); scores[0] = sum0; scores[1] = sum1; scores[2] = sum2; } #define PIXEL_SAD_X4_ALTIVEC( name, ly ) \ static void name( uint8_t *fenc, \ uint8_t *pix0, uint8_t *pix1, \ uint8_t *pix2, uint8_t *pix3, \ intptr_t i_stride, int scores[4] ) \ { \ ALIGNED_16( int sum0 ); \ ALIGNED_16( int sum1 ); \ ALIGNED_16( int sum2 ); \ \ LOAD_ZERO; \ vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v; \ vec_s32_t sum0v, sum1v, sum2v, sum3v; \ \ sum0v = vec_splat_s32( 0 ); \ sum1v = vec_splat_s32( 0 ); \ sum2v = vec_splat_s32( 0 ); \ \ for( int y = 0; y < ly; y++ ) \ { \ PROCESS_PIXS \ vec_u8_t pix3vH = vec_vsx_ld( 0, pix3 ); \ pix3 += i_stride; \ vec_u8_t pix3vL = vec_vsx_ld( 0, pix3 ); \ pix3 += i_stride; \ pix3v = xxpermdi( pix3vH, pix3vL, 0 ); \ sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v ); \ } \ \ sum0v = vec_sums( sum0v, zero_s32v ); \ sum1v = vec_sums( sum1v, zero_s32v ); \ sum2v = vec_sums( sum2v, zero_s32v ); \ sum3v = vec_sums( sum3v, zero_s32v ); \ \ vec_s32_t s01 = vec_mergel( sum0v, sum1v ); \ vec_s32_t s23 = vec_mergel( sum2v, sum3v ); \ vec_s32_t s = xxpermdi( s01, s23, 3 ); \ \ vec_vsx_st( s, 0, scores ); \ } PIXEL_SAD_X4_ALTIVEC( pixel_sad_x4_8x8_altivec, 4 ) PIXEL_SAD_X4_ALTIVEC( pixel_sad_x4_8x16_altivec, 8 ) /*********************************************************************** * SSD routines **********************************************************************/ static int pixel_ssd_16x16_altivec( uint8_t *pix1, intptr_t i_stride_pix1, uint8_t *pix2, intptr_t i_stride_pix2 ) { ALIGNED_16( int sum ); LOAD_ZERO; vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB; vec_u32_t sumv; vec_u8_t diffA, diffB; sumv = vec_splat_u32(0); pix2vA = vec_vsx_ld(0, pix2); pix1vA = vec_ld(0, pix1); for( int y = 0; y < 7; y++ ) { pix1 += i_stride_pix1; pix2 += i_stride_pix2; pix2vB = vec_vsx_ld(0, pix2); pix1vB = vec_ld(0, pix1); diffA = vec_absd(pix1vA, pix2vA); sumv = vec_msum(diffA, diffA, sumv); pix1 += i_stride_pix1; pix2 += i_stride_pix2; pix2vA = vec_vsx_ld(0, pix2); pix1vA = vec_ld(0, pix1); diffB = vec_absd(pix1vB, pix2vB); sumv = vec_msum(diffB, diffB, sumv); } pix1 += i_stride_pix1; pix2 += i_stride_pix2; pix2vB = vec_vsx_ld(0, pix2); pix1vB = vec_ld(0, pix1); diffA = vec_absd(pix1vA, pix2vA); sumv = vec_msum(diffA, diffA, sumv); diffB = vec_absd(pix1vB, pix2vB); sumv = vec_msum(diffB, diffB, sumv); sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v); sumv = vec_splat(sumv, 3); vec_ste((vec_s32_t) sumv, 0, &sum); return sum; } static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1, uint8_t *pix2, intptr_t i_stride_pix2 ) { ALIGNED_16( int sum ); LOAD_ZERO; vec_u8_t pix1v, pix2v; vec_u32_t sumv; vec_u8_t diffv; const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0); sumv = vec_splat_u32(0); for( int y = 0; y < 8; y++ ) { pix1v = vec_vsx_ld(0, pix1); pix2v = vec_vsx_ld(0, pix2); diffv = vec_absd( pix1v, pix2v ); sumv = vec_msum(diffv, diffv, sumv); pix1 += i_stride_pix1; pix2 += i_stride_pix2; } 
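    /* The unaligned loads above fetch 16 bytes per row although only 8
     * belong to this 8x8 block; each vec_msum lane accumulates four bytes,
     * so the vec_sel below keeps the lanes that cover the block and zeroes
     * the lanes fed by out-of-block bytes before the final vec_sums. */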
sumv = vec_sel( zero_u32v, sumv, sel ); sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v); sumv = vec_splat(sumv, 3); vec_ste((vec_s32_t) sumv, 0, &sum); return sum; } /**************************************************************************** * variance ****************************************************************************/ static uint64_t pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); LOAD_ZERO; vec_u32_t sqr_v = zero_u32v; vec_u32_t sum_v = zero_u32v; for( int y = 0; y < 16; y++ ) { vec_u8_t pix0_v = vec_ld(0, pix); sum_v = vec_sum4s(pix0_v, sum_v); sqr_v = vec_msum(pix0_v, pix0_v, sqr_v); pix += i_stride; } sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v ); sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v ); vec_ste(sum_v, 12, sum_tab); vec_ste(sqr_v, 12, sqr_tab); uint32_t sum = sum_tab[3]; uint32_t sqr = sqr_tab[3]; return sum + ((uint64_t)sqr<<32); } static uint64_t pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); LOAD_ZERO; vec_u32_t sqr_v = zero_u32v; vec_u32_t sum_v = zero_u32v; static const vec_u8_t perm_tab[] = { CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* pix=mod16, i_stride=mod16 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17), CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, /* pix=mod8, i_stride=mod16 */ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F), }; vec_u8_t perm = perm_tab[ ((uintptr_t)pix & 8) >> 3 ]; for( int y = 0; y < 4; y++ ) { vec_u8_t pix0_v = vec_ld(0, pix); vec_u8_t pix1_v = vec_ld(i_stride, pix); vec_u8_t pix_v = vec_perm(pix0_v, pix1_v, perm); sum_v = vec_sum4s(pix_v, sum_v); sqr_v = vec_msum(pix_v, pix_v, sqr_v); pix += i_stride<<1; } sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v ); sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v ); vec_ste(sum_v, 12, sum_tab); vec_ste(sqr_v, 12, sqr_tab); uint32_t sum = sum_tab[3]; uint32_t sqr = sqr_tab[3]; return sum + ((uint64_t)sqr<<32); } /********************************************************************** * SA8D routines: sum of 8x8 Hadamard transformed differences **********************************************************************/ /* SA8D_1D unrolled by 8 in Altivec */ #define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, \ sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \ { \ /* int a0 = SRC(0) + SRC(4) */ \ vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \ /* int a4 = SRC(0) - SRC(4) */ \ vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \ /* int a1 = SRC(1) + SRC(5) */ \ vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \ /* int a5 = SRC(1) - SRC(5) */ \ vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \ /* int a2 = SRC(2) + SRC(6) */ \ vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \ /* int a6 = SRC(2) - SRC(6) */ \ vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \ /* int a3 = SRC(3) + SRC(7) */ \ vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \ /* int a7 = SRC(3) - SRC(7) */ \ vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \ \ /* int b0 = a0 + a2 */ \ vec_s16_t b0v = vec_add(a0v, a2v); \ /* int b2 = a0 - a2; */ \ vec_s16_t b2v = vec_sub(a0v, a2v); \ /* int b1 = a1 + a3; */ \ vec_s16_t b1v = vec_add(a1v, a3v); \ /* int b3 = a1 - a3; */ \ vec_s16_t b3v = vec_sub(a1v, a3v); \ /* int b4 = a4 + a6; */ \ vec_s16_t b4v = vec_add(a4v, a6v); \ /* int b6 = a4 - a6; */ \ vec_s16_t b6v = vec_sub(a4v, a6v); \ /* int b5 = a5 + a7; */ \ vec_s16_t b5v = vec_add(a5v, a7v); \ /* int b7 = a5 - a7; */ \ vec_s16_t b7v = vec_sub(a5v, a7v); \ \ /* DST(0, b0 + b1) */ \ sa8d0v 
= vec_add(b0v, b1v); \ /* DST(1, b0 - b1) */ \ sa8d1v = vec_sub(b0v, b1v); \ /* DST(2, b2 + b3) */ \ sa8d2v = vec_add(b2v, b3v); \ /* DST(3, b2 - b3) */ \ sa8d3v = vec_sub(b2v, b3v); \ /* DST(4, b4 + b5) */ \ sa8d4v = vec_add(b4v, b5v); \ /* DST(5, b4 - b5) */ \ sa8d5v = vec_sub(b4v, b5v); \ /* DST(6, b6 + b7) */ \ sa8d6v = vec_add(b6v, b7v); \ /* DST(7, b6 - b7) */ \ sa8d7v = vec_sub(b6v, b7v); \ } static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd=0; PREP_DIFF; vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v; VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v ); VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v ); vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v; SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v); VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v, sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v ); SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v ); /* accumulation of the absolute value of all elements of the resulting block */ vec_s16_t abs0v = VEC_ABS(sa8d0v); vec_s16_t abs1v = VEC_ABS(sa8d1v); vec_s16_t sum01v = vec_add(abs0v, abs1v); vec_s16_t abs2v = VEC_ABS(sa8d2v); vec_s16_t abs3v = VEC_ABS(sa8d3v); vec_s16_t sum23v = vec_add(abs2v, abs3v); vec_s16_t abs4v = VEC_ABS(sa8d4v); vec_s16_t abs5v = VEC_ABS(sa8d5v); vec_s16_t sum45v = vec_add(abs4v, abs5v); vec_s16_t abs6v = VEC_ABS(sa8d6v); vec_s16_t abs7v = VEC_ABS(sa8d7v); vec_s16_t sum67v = vec_add(abs6v, abs7v); vec_s16_t sum0123v = vec_add(sum01v, sum23v); vec_s16_t sum4567v = vec_add(sum45v, sum67v); vec_s32_t sumblocv; sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov ); sumblocv = vec_sum4s(sum4567v, sumblocv ); sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov ); sumblocv = vec_splat(sumblocv, 3); vec_ste(sumblocv, 0, &i_satd); return i_satd; } static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd; i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2; return i_satd; } static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1, uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd; i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2; return i_satd; } #define HADAMARD4_ALTIVEC(d0,d1,d2,d3,s0,s1,s2,s3) {\ vec_s16_t t0 = vec_add(s0, s1); \ vec_s16_t t1 = vec_sub(s0, s1); \ vec_s16_t t2 = vec_add(s2, s3); \ vec_s16_t t3 = vec_sub(s2, s3); \ d0 = vec_add(t0, t2); \ d2 = vec_sub(t0, t2); \ d1 = vec_add(t1, t3); \ d3 = vec_sub(t1, t3); \ } #ifdef WORDS_BIGENDIAN #define vec_perm_extend_s16(val, perm) (vec_s16_t)vec_perm(val, zero_u8v, perm) #else #define vec_perm_extend_s16(val, perm) (vec_s16_t)vec_perm(zero_u8v, val, perm) #endif #define VEC_LOAD_HIGH( p, num ) \ vec_u8_t pix8_##num = vec_ld( stride*num, p ); \ vec_s16_t pix16_s##num = 
vec_perm_extend_s16( pix8_##num, perm ); \ vec_s16_t pix16_d##num; static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm ) { ALIGNED_16( int32_t sum4_tab[4] ); ALIGNED_16( int32_t sum8_tab[4] ); LOAD_ZERO; VEC_LOAD_HIGH( pix, 0 ); VEC_LOAD_HIGH( pix, 1 ); VEC_LOAD_HIGH( pix, 2 ); VEC_LOAD_HIGH( pix, 3 ); HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3, pix16_s0,pix16_s1,pix16_s2,pix16_s3); VEC_LOAD_HIGH( pix, 4 ); VEC_LOAD_HIGH( pix, 5 ); VEC_LOAD_HIGH( pix, 6 ); VEC_LOAD_HIGH( pix, 7 ); HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7, pix16_s4,pix16_s5,pix16_s6,pix16_s7); VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3, pix16_d4, pix16_d5, pix16_d6, pix16_d7, pix16_s0, pix16_s1, pix16_s2, pix16_s3, pix16_s4, pix16_s5, pix16_s6, pix16_s7); HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3, pix16_s0,pix16_s1,pix16_s2,pix16_s3); HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7, pix16_s4,pix16_s5,pix16_s6,pix16_s7); vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) ); vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) ); vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) ); vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) ); vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67)); vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum4_v, zero_s32v), zero_s32v), 12, sum4_tab); vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4); vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4); vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5); vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5); vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6); vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6); vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7); vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7); int sum4 = sum4_tab[3]; VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3, tmpi4, tmpi5, tmpi6, tmpi7, pix16_d0, pix16_d1, pix16_d2, pix16_d3, pix16_d4, pix16_d5, pix16_d6, pix16_d7); vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ), VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) ); vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ), VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) ); vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ), VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) ); vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ), VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) ); vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) ); vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum8_v, zero_s32v), zero_s32v), 12, sum8_tab); int sum8 = sum8_tab[3]; ALIGNED_16( int16_t tmp0_4_tab[8] ); vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab); sum4 -= tmp0_4_tab[0]; sum8 -= tmp0_4_tab[0]; return ((uint64_t)sum8<<32) + sum4; } static const vec_u8_t hadamard_permtab[] = { CV(0x10,0x00,0x11,0x01, 0x12,0x02,0x13,0x03, /* pix = mod16 */ 0x14,0x04,0x15,0x05, 0x16,0x06,0x17,0x07 ), CV(0x18,0x08,0x19,0x09, 0x1A,0x0A,0x1B,0x0B, /* pix = mod8 */ 0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F ) }; static uint64_t pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride ) { int idx = ((uintptr_t)pix & 8) >> 3; vec_u8_t permh = hadamard_permtab[idx]; vec_u8_t perml = hadamard_permtab[!idx]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh ); sum += pixel_hadamard_ac_altivec( pix+8, stride, perml ); sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, permh ); sum += 
pixel_hadamard_ac_altivec( pix+8*stride+8, stride, perml ); return ((sum>>34)<<32) + ((uint32_t)sum>>1); } static uint64_t pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride ) { int idx = ((uintptr_t)pix & 8) >> 3; vec_u8_t permh = hadamard_permtab[idx]; vec_u8_t perml = hadamard_permtab[!idx]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh ); sum += pixel_hadamard_ac_altivec( pix+8, stride, perml ); return ((sum>>34)<<32) + ((uint32_t)sum>>1); } static uint64_t pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride ) { vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm ); sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, perm ); return ((sum>>34)<<32) + ((uint32_t)sum>>1); } static uint64_t pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride ) { vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm ); return ((sum>>34)<<32) + ((uint32_t)sum>>1); } /**************************************************************************** * structural similarity metric ****************************************************************************/ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1, const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) { ALIGNED_16( int temp[4] ); vec_u8_t pix1v, pix2v; vec_u32_t s1v, s2v, ssv, s12v; LOAD_ZERO; s1v = s2v = ssv = s12v = zero_u32v; for( int y = 0; y < 4; y++ ) { pix1v = vec_vsx_ld( y*stride1, pix1 ); pix2v = vec_vsx_ld( y*stride2, pix2 ); s1v = vec_sum4s( pix1v, s1v ); s2v = vec_sum4s( pix2v, s2v ); ssv = vec_msum( pix1v, pix1v, ssv ); ssv = vec_msum( pix2v, pix2v, ssv ); s12v = vec_msum( pix1v, pix2v, s12v ); } vec_st( (vec_s32_t)s1v, 0, temp ); sums[0][0] = temp[0]; sums[1][0] = temp[1]; vec_st( (vec_s32_t)s2v, 0, temp ); sums[0][1] = temp[0]; sums[1][1] = temp[1]; vec_st( (vec_s32_t)ssv, 0, temp ); sums[0][2] = temp[0]; sums[1][2] = temp[1]; vec_st( (vec_s32_t)s12v, 0, temp ); sums[0][3] = temp[0]; sums[1][3] = temp[1]; } #define SATD_X( size ) \ static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\ intptr_t i_stride, int scores[3] )\ {\ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\ }\ static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\ uint8_t *pix3, intptr_t i_stride, int scores[4] )\ {\ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\ scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride );\ } SATD_X( 16x16 )\ SATD_X( 16x8 )\ SATD_X( 8x16 )\ SATD_X( 8x8 )\ SATD_X( 8x4 )\ SATD_X( 4x8 )\ SATD_X( 4x4 ) #define INTRA_MBCMP_8x8( mbcmp )\ static void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] )\ {\ ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\ x264_predict_8x8_v_c( pix, edge );\ res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_8x8_h_c( pix, edge );\ res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_8x8_dc_c( pix, edge );\ res[2] = 
pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\ } INTRA_MBCMP_8x8(sad) INTRA_MBCMP_8x8(sa8d) #define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\ static void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] )\ {\ x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\ res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\ res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\ res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\ } INTRA_MBCMP(satd, 4, v, h, dc, ) INTRA_MBCMP(sad, 8, dc, h, v, c ) INTRA_MBCMP(satd, 8, dc, h, v, c ) INTRA_MBCMP(sad, 16, v, h, dc, ) INTRA_MBCMP(satd, 16, v, h, dc, ) #endif // !HIGH_BIT_DEPTH /**************************************************************************** * x264_pixel_init: ****************************************************************************/ void x264_pixel_init_altivec( x264_pixel_function_t *pixf ) { #if !HIGH_BIT_DEPTH pixf->sad[PIXEL_16x16] = pixel_sad_16x16_altivec; pixf->sad[PIXEL_8x16] = pixel_sad_8x16_altivec; pixf->sad[PIXEL_16x8] = pixel_sad_16x8_altivec; pixf->sad[PIXEL_8x8] = pixel_sad_8x8_altivec; pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec; pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_altivec; pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_altivec; pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_altivec; pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec; pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_altivec; pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_altivec; pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_altivec; pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec; pixf->satd[PIXEL_8x16] = pixel_satd_8x16_altivec; pixf->satd[PIXEL_16x8] = pixel_satd_16x8_altivec; pixf->satd[PIXEL_8x8] = pixel_satd_8x8_altivec; pixf->satd[PIXEL_8x4] = pixel_satd_8x4_altivec; pixf->satd[PIXEL_4x8] = pixel_satd_4x8_altivec; pixf->satd[PIXEL_4x4] = pixel_satd_4x4_altivec; pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec; pixf->satd_x3[PIXEL_8x16] = pixel_satd_x3_8x16_altivec; pixf->satd_x3[PIXEL_16x8] = pixel_satd_x3_16x8_altivec; pixf->satd_x3[PIXEL_8x8] = pixel_satd_x3_8x8_altivec; pixf->satd_x3[PIXEL_8x4] = pixel_satd_x3_8x4_altivec; pixf->satd_x3[PIXEL_4x8] = pixel_satd_x3_4x8_altivec; pixf->satd_x3[PIXEL_4x4] = pixel_satd_x3_4x4_altivec; pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec; pixf->satd_x4[PIXEL_8x16] = pixel_satd_x4_8x16_altivec; pixf->satd_x4[PIXEL_16x8] = pixel_satd_x4_16x8_altivec; pixf->satd_x4[PIXEL_8x8] = pixel_satd_x4_8x8_altivec; pixf->satd_x4[PIXEL_8x4] = pixel_satd_x4_8x4_altivec; pixf->satd_x4[PIXEL_4x8] = pixel_satd_x4_4x8_altivec; pixf->satd_x4[PIXEL_4x4] = pixel_satd_x4_4x4_altivec; pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_altivec; pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_altivec; pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_altivec; pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_altivec; pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_altivec; pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec; pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec; pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8_altivec; pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec; pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8_altivec; pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_altivec; pixf->var[PIXEL_16x16] = 
pixel_var_16x16_altivec; pixf->var[PIXEL_8x8] = pixel_var_8x8_altivec; pixf->hadamard_ac[PIXEL_16x16] = pixel_hadamard_ac_16x16_altivec; pixf->hadamard_ac[PIXEL_16x8] = pixel_hadamard_ac_16x8_altivec; pixf->hadamard_ac[PIXEL_8x16] = pixel_hadamard_ac_8x16_altivec; pixf->hadamard_ac[PIXEL_8x8] = pixel_hadamard_ac_8x8_altivec; pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec; #endif // !HIGH_BIT_DEPTH } x264-master/common/ppc/pixel.h000066400000000000000000000025661502133446700164340ustar00rootroot00000000000000/***************************************************************************** * pixel.h: ppc pixel metrics ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Eric Petit * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_PPC_PIXEL_H #define X264_PPC_PIXEL_H #define x264_pixel_init_altivec x264_template(pixel_init_altivec) void x264_pixel_init_altivec( x264_pixel_function_t *pixf ); #endif x264-master/common/ppc/ppccommon.h000066400000000000000000000311341502133446700172770ustar00rootroot00000000000000/***************************************************************************** * ppccommon.h: ppc utility macros ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Eric Petit * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #if HAVE_ALTIVEC_H #include #endif /*********************************************************************** * For constant vectors, use parentheses on OS X and braces on Linux **********************************************************************/ #if defined(__APPLE__) && __GNUC__ < 4 #define CV(a...) (a) #else #define CV(a...) 
{a} #endif /*********************************************************************** * Vector types **********************************************************************/ #define vec_u8_t vector unsigned char #define vec_s8_t vector signed char #define vec_u16_t vector unsigned short #define vec_s16_t vector signed short #define vec_u32_t vector unsigned int #define vec_s32_t vector signed int #if HAVE_VSX #define vec_u64_t vector unsigned long long #define vec_s64_t vector signed long long typedef union { uint64_t s[2]; vec_u64_t v; } vec_u64_u; typedef union { int64_t s[2]; vec_s64_t v; } vec_s64_u; #endif typedef union { uint32_t s[4]; vec_u32_t v; } vec_u32_u; typedef union { int32_t s[4]; vec_s32_t v; } vec_s32_u; typedef union { uint16_t s[8]; vec_u16_t v; } vec_u16_u; typedef union { int16_t s[8]; vec_s16_t v; } vec_s16_u; typedef union { uint8_t s[16]; vec_u8_t v; } vec_u8_u; typedef union { int8_t s[16]; vec_s8_t v; } vec_s8_u; /*********************************************************************** * Null vector **********************************************************************/ #define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 ) #define zero_u8v (vec_u8_t) zerov #define zero_s8v (vec_s8_t) zerov #define zero_u16v (vec_u16_t) zerov #define zero_s16v (vec_s16_t) zerov #define zero_u32v (vec_u32_t) zerov #define zero_s32v (vec_s32_t) zerov /*********************************************************************** * 8 <-> 16 bits conversions **********************************************************************/ #ifdef WORDS_BIGENDIAN #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) #else #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) #endif #define vec_u8_to_u16(v) vec_u8_to_u16_h(v) #define vec_u8_to_s16(v) vec_u8_to_s16_h(v) #define vec_u16_to_u8(v) vec_pack( v, zero_u16v ) #define vec_s16_to_u8(v) vec_packsu( v, zero_s16v ) /*********************************************************************** * 16 <-> 32 bits conversions **********************************************************************/ #ifdef WORDS_BIGENDIAN #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) #else #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) #endif #define vec_u16_to_u32(v) vec_u16_to_u32_h(v) #define vec_u16_to_s32(v) vec_u16_to_s32_h(v) #define vec_u32_to_u16(v) vec_pack( v, zero_u32v ) #define vec_s32_to_u16(v) vec_packsu( v, zero_s32v ) /*********************************************************************** * VEC_STORE##n: stores n bytes from vector v to address 
p **********************************************************************/ #ifndef __POWER9_VECTOR__ #define VEC_STORE8( v, p ) \ vec_vsx_st( vec_xxpermdi( v, vec_vsx_ld( 0, p ), 1 ), 0, p ) #else #define VEC_STORE8( v, p ) vec_xst_len( v, p, 8 ) #endif /*********************************************************************** * VEC_TRANSPOSE_8 *********************************************************************** * Transposes a 8x8 matrix of s16 vectors **********************************************************************/ #define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \ b0 = vec_mergeh( a0, a4 ); \ b1 = vec_mergel( a0, a4 ); \ b2 = vec_mergeh( a1, a5 ); \ b3 = vec_mergel( a1, a5 ); \ b4 = vec_mergeh( a2, a6 ); \ b5 = vec_mergel( a2, a6 ); \ b6 = vec_mergeh( a3, a7 ); \ b7 = vec_mergel( a3, a7 ); \ a0 = vec_mergeh( b0, b4 ); \ a1 = vec_mergel( b0, b4 ); \ a2 = vec_mergeh( b1, b5 ); \ a3 = vec_mergel( b1, b5 ); \ a4 = vec_mergeh( b2, b6 ); \ a5 = vec_mergel( b2, b6 ); \ a6 = vec_mergeh( b3, b7 ); \ a7 = vec_mergel( b3, b7 ); \ b0 = vec_mergeh( a0, a4 ); \ b1 = vec_mergel( a0, a4 ); \ b2 = vec_mergeh( a1, a5 ); \ b3 = vec_mergel( a1, a5 ); \ b4 = vec_mergeh( a2, a6 ); \ b5 = vec_mergel( a2, a6 ); \ b6 = vec_mergeh( a3, a7 ); \ b7 = vec_mergel( a3, a7 ) /*********************************************************************** * VEC_TRANSPOSE_4 *********************************************************************** * Transposes a 4x4 matrix of s16 vectors. * Actually source and destination are 8x4. The low elements of the * source are discarded and the low elements of the destination mustn't * be used. **********************************************************************/ #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ b0 = vec_mergeh( a0, a0 ); \ b1 = vec_mergeh( a1, a0 ); \ b2 = vec_mergeh( a2, a0 ); \ b3 = vec_mergeh( a3, a0 ); \ a0 = vec_mergeh( b0, b2 ); \ a1 = vec_mergel( b0, b2 ); \ a2 = vec_mergeh( b1, b3 ); \ a3 = vec_mergel( b1, b3 ); \ b0 = vec_mergeh( a0, a2 ); \ b1 = vec_mergel( a0, a2 ); \ b2 = vec_mergeh( a1, a3 ); \ b3 = vec_mergel( a1, a3 ) /*********************************************************************** * VEC_DIFF_H *********************************************************************** * p1, p2: u8 * * i1, i2, n: int * d: s16v * * Loads n bytes from p1 and p2, do the diff of the high elements into * d, increments p1 and p2 by i1 and i2 into known offset g **********************************************************************/ #define PREP_DIFF \ LOAD_ZERO; \ vec_s16_t pix1v, pix2v; #define VEC_DIFF_H(p1,i1,p2,i2,n,d) \ pix1v = vec_vsx_ld( 0, (int16_t *)p1 ); \ pix1v = vec_u8_to_s16( pix1v ); \ pix2v = vec_vsx_ld( 0, (int16_t *)p2 ); \ pix2v = vec_u8_to_s16( pix2v ); \ d = vec_sub( pix1v, pix2v ); \ p1 += i1; \ p2 += i2 /*********************************************************************** * VEC_DIFF_HL *********************************************************************** * p1, p2: u8 * * i1, i2: int * dh, dl: s16v * * Loads 16 bytes from p1 and p2, do the diff of the high elements into * dh, the diff of the low elements into dl, increments p1 and p2 by i1 * and i2 **********************************************************************/ #define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \ pix1v = (vec_s16_t)vec_ld(0, p1); \ temp0v = vec_u8_to_s16_h( pix1v ); \ temp1v = vec_u8_to_s16_l( pix1v ); \ pix2v = vec_vsx_ld( 0, (int16_t *)p2 ); \ temp2v = vec_u8_to_s16_h( pix2v ); \ temp3v = vec_u8_to_s16_l( pix2v ); \ dh = vec_sub( temp0v, temp2v ); \ dl = 
vec_sub( temp1v, temp3v ); \ p1 += i1; \ p2 += i2 /*********************************************************************** * VEC_DIFF_H_8BYTE_ALIGNED *********************************************************************** * p1, p2: u8 * * i1, i2, n: int * d: s16v * * Loads n bytes from p1 and p2, do the diff of the high elements into * d, increments p1 and p2 by i1 and i2 * Slightly faster when we know we are loading/diffing 8bytes which * are 8 byte aligned. Reduces need for two loads and two vec_lvsl()'s **********************************************************************/ #define PREP_DIFF_8BYTEALIGNED \ LOAD_ZERO; \ vec_s16_t pix1v, pix2v; \ vec_u8_t pix1v8, pix2v8; \ #define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \ pix1v8 = vec_vsx_ld( 0, p1 ); \ pix2v8 = vec_vsx_ld( 0, p2 ); \ pix1v = vec_u8_to_s16( pix1v8 ); \ pix2v = vec_u8_to_s16( pix2v8 ); \ d = vec_sub( pix1v, pix2v); \ p1 += i1; \ p2 += i2; #if !HAVE_VSX #undef vec_vsx_ld #define vec_vsx_ld(off, src) \ vec_perm(vec_ld(off, src), vec_ld(off + 15, src), vec_lvsl(off, src)) #undef vec_vsx_st #define vec_vsx_st(v, off, dst) \ do { \ uint8_t *_dst = (uint8_t*)(dst); \ vec_u8_t _v = (vec_u8_t)(v); \ vec_u8_t _a = vec_ld(off, _dst); \ vec_u8_t _b = vec_ld(off + 15, _dst); \ vec_u8_t _e = vec_perm(_b, _a, vec_lvsl(0, _dst)); \ vec_u8_t _m = vec_lvsr(0, _dst); \ \ vec_st(vec_perm(_v, _e, _m), off + 15, _dst); \ vec_st(vec_perm(_e, _v, _m), off, _dst); \ } while( 0 ) #endif #ifndef __POWER9_VECTOR__ #define vec_absd( a, b ) vec_sub( vec_max( a, b ), vec_min( a, b ) ) #endif // vec_xxpermdi is quite useful but some version of clang do not expose it #if !HAVE_VSX || (defined(__clang__) && __clang_major__ < 6) static const vec_u8_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; static const vec_u8_t xxpermdi1_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; static const vec_u8_t xxpermdi2_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; static const vec_u8_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; #define xxpermdi(a, b, c) vec_perm(a, b, xxpermdi##c##_perm) #elif (defined(__GNUC__) && (__GNUC__ > 6 || (__GNUC__ == 6 && __GNUC_MINOR__ >= 3))) || \ (defined(__clang__) && __clang_major__ >= 7) #define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) #endif // vec_xxpermdi has its endianness bias exposed in early gcc and clang #ifdef WORDS_BIGENDIAN #ifndef xxpermdi #define xxpermdi(a, b, c) vec_xxpermdi(a, b, c) #endif #else #ifndef xxpermdi #define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3) #endif #endif x264-master/common/ppc/predict.c000066400000000000000000000215551502133446700167370ustar00rootroot00000000000000/***************************************************************************** * predict.c: ppc intra prediction ***************************************************************************** * Copyright (C) 2007-2025 x264 project * * Authors: Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "ppccommon.h" #include "predict.h" #include "pixel.h" #if !HIGH_BIT_DEPTH static void predict_8x8c_p_altivec( uint8_t *src ) { int H = 0, V = 0; for( int i = 0; i < 4; i++ ) { H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] ); V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] ); } int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] ); int b = ( 17 * H + 16 ) >> 5; int c = ( 17 * V + 16 ) >> 5; int i00 = a -3*b -3*c + 16; vec_s16_u i00_u, b_u, c_u; i00_u.s[0] = i00; b_u.s[0] = b; c_u.s[0] = c; vec_u16_t val5_v = vec_splat_u16(5); vec_s16_t i00_v, b_v, c_v; i00_v = vec_splat(i00_u.v, 0); b_v = vec_splat(b_u.v, 0); c_v = vec_splat(c_u.v, 0); vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7); vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v); for( int i = 0; i < 8; ++i ) { vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v); vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_0_v); VEC_STORE8(com_sat_v, &src[0]); src += FDEC_STRIDE; add_i0_b_0v = vec_adds(add_i0_b_0v, c_v); } } /**************************************************************************** * 16x16 prediction for intra luma block ****************************************************************************/ static void predict_16x16_p_altivec( uint8_t *src ) { int H = 0, V = 0; for( int i = 1; i <= 8; i++ ) { H += i * ( src[7+i - FDEC_STRIDE ] - src[7-i - FDEC_STRIDE ] ); V += i * ( src[(7+i)*FDEC_STRIDE -1] - src[(7-i)*FDEC_STRIDE -1] ); } int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] ); int b = ( 5 * H + 32 ) >> 6; int c = ( 5 * V + 32 ) >> 6; int i00 = a - b * 7 - c * 7 + 16; vec_s16_u i00_u, b_u, c_u; i00_u.s[0] = i00; b_u.s[0] = b; c_u.s[0] = c; vec_u16_t val5_v = vec_splat_u16(5); vec_s16_t i00_v, b_v, c_v; i00_v = vec_splat(i00_u.v, 0); b_v = vec_splat(b_u.v, 0); c_v = vec_splat(c_u.v, 0); vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7); vec_s16_t b8_v = vec_sl(b_v, vec_splat_u16(3)); vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v); vec_s16_t add_i0_b_8v = vec_adds(b8_v, add_i0_b_0v); for( int y = 0; y < 16; y++ ) { vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v); vec_s16_t shift_8_v = vec_sra(add_i0_b_8v, val5_v); vec_u8_t com_sat_v = vec_packsu(shift_0_v, shift_8_v); vec_st( com_sat_v, 0, &src[0]); src += FDEC_STRIDE; add_i0_b_0v = vec_adds(add_i0_b_0v, c_v); add_i0_b_8v = vec_adds(add_i0_b_8v, c_v); } } #define PREDICT_16x16_DC_ALTIVEC(v) \ for( int i = 0; i < 16; i += 2) \ { \ vec_st(v, 0, src); \ vec_st(v, FDEC_STRIDE, src); \ src += FDEC_STRIDE*2; \ } static void predict_16x16_dc_altivec( uint8_t *src ) { uint32_t dc = 0; for( int i = 0; i < 16; i++ ) { dc += src[-1 + i * FDEC_STRIDE]; dc += src[i - FDEC_STRIDE]; } vec_u8_u v ; v.s[0] = (( dc + 16 ) >> 5); vec_u8_t bc_v = vec_splat(v.v, 0); PREDICT_16x16_DC_ALTIVEC(bc_v); } static void 
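/* DC prediction from the left edge only: sum the 16 left neighbours,
 * round with (dc + 8) >> 4, splat the result into a vector and store it
 * across all 16 rows via PREDICT_16x16_DC_ALTIVEC (two rows per loop
 * iteration). The _top and _128 variants that follow use the same store
 * pattern with a different DC source. */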
predict_16x16_dc_left_altivec( uint8_t *src ) { uint32_t dc = 0; for( int i = 0; i < 16; i++ ) dc += src[-1 + i * FDEC_STRIDE]; vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4); vec_u8_t bc_v = vec_splat(v.v, 0); PREDICT_16x16_DC_ALTIVEC(bc_v); } static void predict_16x16_dc_top_altivec( uint8_t *src ) { uint32_t dc = 0; for( int i = 0; i < 16; i++ ) dc += src[i - FDEC_STRIDE]; vec_u8_u v ; v.s[0] = (( dc + 8 ) >> 4); vec_u8_t bc_v = vec_splat(v.v, 0); PREDICT_16x16_DC_ALTIVEC(bc_v); } static void predict_16x16_dc_128_altivec( uint8_t *src ) { /* test if generating the constant is faster than loading it. vector unsigned int bc_v = (vector unsigned int)CV(0x80808080, 0x80808080, 0x80808080, 0x80808080); */ vec_u8_t bc_v = vec_vslb((vec_u8_t)vec_splat_u8(1),(vec_u8_t)vec_splat_u8(7)); PREDICT_16x16_DC_ALTIVEC(bc_v); } static void predict_16x16_h_altivec( uint8_t *src ) { vec_u8_t v1 = vec_ld( -1, src ); vec_u8_t v2 = vec_ld( -1, src + FDEC_STRIDE ); vec_u8_t v3 = vec_ld( -1, src + FDEC_STRIDE * 2 ); vec_u8_t v4 = vec_ld( -1, src + FDEC_STRIDE * 3 ); vec_u8_t v5 = vec_ld( -1, src + FDEC_STRIDE * 4 ); vec_u8_t v6 = vec_ld( -1, src + FDEC_STRIDE * 5 ); vec_u8_t v7 = vec_ld( -1, src + FDEC_STRIDE * 6 ); vec_u8_t v8 = vec_ld( -1, src + FDEC_STRIDE * 7 ); vec_u8_t v9 = vec_ld( -1, src + FDEC_STRIDE * 8 ); vec_u8_t vA = vec_ld( -1, src + FDEC_STRIDE * 9 ); vec_u8_t vB = vec_ld( -1, src + FDEC_STRIDE * 10 ); vec_u8_t vC = vec_ld( -1, src + FDEC_STRIDE * 11 ); vec_u8_t vD = vec_ld( -1, src + FDEC_STRIDE * 12 ); vec_u8_t vE = vec_ld( -1, src + FDEC_STRIDE * 13 ); vec_u8_t vF = vec_ld( -1, src + FDEC_STRIDE * 14 ); vec_u8_t vG = vec_ld( -1, src + FDEC_STRIDE * 15 ); vec_u8_t v_v1 = vec_splat( v1, 15 ); vec_u8_t v_v2 = vec_splat( v2, 15 ); vec_u8_t v_v3 = vec_splat( v3, 15 ); vec_u8_t v_v4 = vec_splat( v4, 15 ); vec_u8_t v_v5 = vec_splat( v5, 15 ); vec_u8_t v_v6 = vec_splat( v6, 15 ); vec_u8_t v_v7 = vec_splat( v7, 15 ); vec_u8_t v_v8 = vec_splat( v8, 15 ); vec_u8_t v_v9 = vec_splat( v9, 15 ); vec_u8_t v_vA = vec_splat( vA, 15 ); vec_u8_t v_vB = vec_splat( vB, 15 ); vec_u8_t v_vC = vec_splat( vC, 15 ); vec_u8_t v_vD = vec_splat( vD, 15 ); vec_u8_t v_vE = vec_splat( vE, 15 ); vec_u8_t v_vF = vec_splat( vF, 15 ); vec_u8_t v_vG = vec_splat( vG, 15 ); vec_st( v_v1, 0, src ); vec_st( v_v2, 0, src + FDEC_STRIDE ); vec_st( v_v3, 0, src + FDEC_STRIDE * 2 ); vec_st( v_v4, 0, src + FDEC_STRIDE * 3 ); vec_st( v_v5, 0, src + FDEC_STRIDE * 4 ); vec_st( v_v6, 0, src + FDEC_STRIDE * 5 ); vec_st( v_v7, 0, src + FDEC_STRIDE * 6 ); vec_st( v_v8, 0, src + FDEC_STRIDE * 7 ); vec_st( v_v9, 0, src + FDEC_STRIDE * 8 ); vec_st( v_vA, 0, src + FDEC_STRIDE * 9 ); vec_st( v_vB, 0, src + FDEC_STRIDE * 10 ); vec_st( v_vC, 0, src + FDEC_STRIDE * 11 ); vec_st( v_vD, 0, src + FDEC_STRIDE * 12 ); vec_st( v_vE, 0, src + FDEC_STRIDE * 13 ); vec_st( v_vF, 0, src + FDEC_STRIDE * 14 ); vec_st( v_vG, 0, src + FDEC_STRIDE * 15 ); } static void predict_16x16_v_altivec( uint8_t *src ) { vec_u32_u v; v.s[0] = *(uint32_t*)&src[ 0-FDEC_STRIDE]; v.s[1] = *(uint32_t*)&src[ 4-FDEC_STRIDE]; v.s[2] = *(uint32_t*)&src[ 8-FDEC_STRIDE]; v.s[3] = *(uint32_t*)&src[12-FDEC_STRIDE]; for( int i = 0; i < 16; i++ ) { vec_st(v.v, 0, (uint32_t*)src); src += FDEC_STRIDE; } } #endif // !HIGH_BIT_DEPTH /**************************************************************************** * Exported functions: ****************************************************************************/ void x264_predict_16x16_init_altivec( x264_predict_t pf[7] ) { #if !HIGH_BIT_DEPTH pf[I_PRED_16x16_V 
] = predict_16x16_v_altivec; pf[I_PRED_16x16_H ] = predict_16x16_h_altivec; pf[I_PRED_16x16_DC] = predict_16x16_dc_altivec; pf[I_PRED_16x16_P ] = predict_16x16_p_altivec; pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec; pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec; pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec; #endif // !HIGH_BIT_DEPTH } void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] ) { #if !HIGH_BIT_DEPTH pf[I_PRED_CHROMA_P] = predict_8x8c_p_altivec; #endif // !HIGH_BIT_DEPTH } x264-master/common/ppc/predict.h000066400000000000000000000030751502133446700167410ustar00rootroot00000000000000/***************************************************************************** * predict.h: ppc intra prediction ***************************************************************************** * Copyright (C) 2007-2025 x264 project * * Authors: Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_PPC_PREDICT_H #define X264_PPC_PREDICT_H #define x264_predict_16x16_init_altivec x264_template(predict_16x16_init_altivec) void x264_predict_16x16_init_altivec( x264_predict_t pf[7] ); #define x264_predict_8x8c_init_altivec x264_template(predict_8x8c_init_altivec) void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] ); #endif /* X264_PPC_PREDICT_H */ x264-master/common/ppc/quant.c000066400000000000000000000513711502133446700164340ustar00rootroot00000000000000/***************************************************************************** * quant.c: ppc quantization ***************************************************************************** * Copyright (C) 2007-2025 x264 project * * Authors: Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "ppccommon.h" #include "quant.h" #if !HIGH_BIT_DEPTH // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" #define QUANT_16_U( idx0, idx1 ) \ { \ temp1v = vec_ld((idx0), dct); \ temp2v = vec_ld((idx1), dct); \ mfvA = vec_ld((idx0), mf); \ mfvB = vec_ld((idx1), mf); \ biasvA = vec_ld((idx0), bias); \ biasvB = vec_ld((idx1), bias); \ mskA = vec_cmplt(temp1v, zero_s16v); \ mskB = vec_cmplt(temp2v, zero_s16v); \ coefvA = (vec_u16_t)vec_abs( temp1v ); \ coefvB = (vec_u16_t)vec_abs( temp2v ); \ coefvA = vec_adds(coefvA, biasvA); \ coefvB = vec_adds(coefvB, biasvB); \ multEvenvA = vec_mule(coefvA, mfvA); \ multOddvA = vec_mulo(coefvA, mfvA); \ multEvenvB = vec_mule(coefvB, mfvB); \ multOddvB = vec_mulo(coefvB, mfvB); \ multEvenvA = vec_sr(multEvenvA, i_qbitsv); \ multOddvA = vec_sr(multOddvA, i_qbitsv); \ multEvenvB = vec_sr(multEvenvB, i_qbitsv); \ multOddvB = vec_sr(multOddvB, i_qbitsv); \ temp1v = (vec_s16_t) vec_packs( multEvenvA, multOddvA ); \ tmpv = xxpermdi( temp1v, temp1v, 2 ); \ temp1v = vec_mergeh( temp1v, tmpv ); \ temp2v = (vec_s16_t) vec_packs( multEvenvB, multOddvB ); \ tmpv = xxpermdi( temp2v, temp2v, 2 ); \ temp2v = vec_mergeh( temp2v, tmpv ); \ temp1v = vec_xor(temp1v, mskA); \ temp2v = vec_xor(temp2v, mskB); \ temp1v = vec_adds(temp1v, vec_and(mskA, one)); \ vec_st(temp1v, (idx0), dct); \ temp2v = vec_adds(temp2v, vec_and(mskB, one)); \ nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), dct); \ } int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ) { LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 ); vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; vec_s16_t one = vec_splat_s16(1); vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_u16_t mfvB; vec_u16_t biasvB; vec_s16_t temp1v, temp2v, tmpv; QUANT_16_U( 0, 16 ); return vec_any_ne(nz, zero_s16v); } int x264_quant_4x4x4_altivec( dctcoef dcta[4][16], udctcoef mf[16], udctcoef bias[16] ) { LOAD_ZERO; vec_u32_t i_qbitsv = vec_splats( (uint32_t)16 ); vec_s16_t one = vec_splat_s16( 1 ); vec_s16_t nz0, nz1, nz2, nz3; vector bool short mskA0; vec_u16_t coefvA0; vec_u32_t multEvenvA0, multOddvA0; vec_u16_t mfvA0; vec_u16_t biasvA0; vector bool short mskB0; vec_u16_t coefvB0; vec_u32_t multEvenvB0, multOddvB0; vec_u16_t mfvB0; vec_u16_t biasvB0; vector bool short mskA1; vec_u16_t coefvA1; vec_u32_t multEvenvA1, multOddvA1; vec_u16_t mfvA1; vec_u16_t biasvA1; vector bool short mskB1; vec_u16_t coefvB1; vec_u32_t multEvenvB1, multOddvB1; vec_u16_t mfvB1; vec_u16_t biasvB1; vector bool short mskA2; vec_u16_t coefvA2; vec_u32_t multEvenvA2, multOddvA2; vec_u16_t mfvA2; vec_u16_t biasvA2; vector bool short mskB2; vec_u16_t coefvB2; vec_u32_t multEvenvB2, multOddvB2; vec_u16_t mfvB2; vec_u16_t biasvB2; vector bool short mskA3; vec_u16_t coefvA3; vec_u32_t multEvenvA3, multOddvA3; vec_u16_t mfvA3; vec_u16_t biasvA3; vector bool short mskB3; vec_u16_t coefvB3; vec_u32_t multEvenvB3, multOddvB3; vec_u16_t mfvB3; vec_u16_t biasvB3; vec_s16_t temp1v, temp2v; vec_s16_t tmpv0; vec_s16_t tmpv1; dctcoef *dct0 = dcta[0]; dctcoef *dct1 = dcta[1]; dctcoef *dct2 = dcta[2]; dctcoef *dct3 = dcta[3]; temp1v = vec_ld( 0, dct0 ); temp2v = vec_ld( 16, dct0 ); mfvA0 = vec_ld( 0, mf ); mfvB0 = vec_ld( 16, mf ); biasvA0 = vec_ld( 0, bias ); biasvB0 = vec_ld( 16, bias ); mskA0 = 
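/* The cmplt masks record which coefficients are negative: quantization
 * runs on |coef| (vec_abs), and the sign is folded back in at the end by
 * vec_xor with the mask followed by vec_adds of (mask & 1), i.e. a
 * two's-complement negation of the masked lanes. Scalar sketch of one
 * lane (illustrative only):
 *   int level = ( (abs(coef) + bias[i]) * mf[i] ) >> 16;
 *   coef = coef < 0 ? -level : level;
 */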
vec_cmplt( temp1v, zero_s16v ); mskB0 = vec_cmplt( temp2v, zero_s16v ); coefvA0 = (vec_u16_t)vec_abs( temp1v ); coefvB0 = (vec_u16_t)vec_abs( temp2v ); temp1v = vec_ld( 0, dct1 ); temp2v = vec_ld( 16, dct1 ); mfvA1 = vec_ld( 0, mf ); mfvB1 = vec_ld( 16, mf ); biasvA1 = vec_ld( 0, bias ); biasvB1 = vec_ld( 16, bias ); mskA1 = vec_cmplt( temp1v, zero_s16v ); mskB1 = vec_cmplt( temp2v, zero_s16v ); coefvA1 = (vec_u16_t)vec_abs( temp1v ); coefvB1 = (vec_u16_t)vec_abs( temp2v ); temp1v = vec_ld( 0, dct2 ); temp2v = vec_ld( 16, dct2 ); mfvA2 = vec_ld( 0, mf ); mfvB2 = vec_ld( 16, mf ); biasvA2 = vec_ld( 0, bias ); biasvB2 = vec_ld( 16, bias ); mskA2 = vec_cmplt( temp1v, zero_s16v ); mskB2 = vec_cmplt( temp2v, zero_s16v ); coefvA2 = (vec_u16_t)vec_abs( temp1v ); coefvB2 = (vec_u16_t)vec_abs( temp2v ); temp1v = vec_ld( 0, dct3 ); temp2v = vec_ld( 16, dct3 ); mfvA3 = vec_ld( 0, mf ); mfvB3 = vec_ld( 16, mf ); biasvA3 = vec_ld( 0, bias ); biasvB3 = vec_ld( 16, bias ); mskA3 = vec_cmplt( temp1v, zero_s16v ); mskB3 = vec_cmplt( temp2v, zero_s16v ); coefvA3 = (vec_u16_t)vec_abs( temp1v ); coefvB3 = (vec_u16_t)vec_abs( temp2v ); coefvA0 = vec_adds( coefvA0, biasvA0 ); coefvB0 = vec_adds( coefvB0, biasvB0 ); coefvA1 = vec_adds( coefvA1, biasvA1 ); coefvB1 = vec_adds( coefvB1, biasvB1 ); coefvA2 = vec_adds( coefvA2, biasvA2 ); coefvB2 = vec_adds( coefvB2, biasvB2 ); coefvA3 = vec_adds( coefvA3, biasvA3 ); coefvB3 = vec_adds( coefvB3, biasvB3 ); multEvenvA0 = vec_mule( coefvA0, mfvA0 ); multOddvA0 = vec_mulo( coefvA0, mfvA0 ); multEvenvB0 = vec_mule( coefvB0, mfvB0 ); multOddvB0 = vec_mulo( coefvB0, mfvB0 ); multEvenvA0 = vec_sr( multEvenvA0, i_qbitsv ); multOddvA0 = vec_sr( multOddvA0, i_qbitsv ); multEvenvB0 = vec_sr( multEvenvB0, i_qbitsv ); multOddvB0 = vec_sr( multOddvB0, i_qbitsv ); temp1v = (vec_s16_t)vec_packs( multEvenvA0, multOddvA0 ); temp2v = (vec_s16_t)vec_packs( multEvenvB0, multOddvB0 ); tmpv0 = xxpermdi( temp1v, temp1v, 2 ); tmpv1 = xxpermdi( temp2v, temp2v, 2 ); temp1v = vec_mergeh( temp1v, tmpv0 ); temp2v = vec_mergeh( temp2v, tmpv1 ); temp1v = vec_xor( temp1v, mskA0 ); temp2v = vec_xor( temp2v, mskB0 ); temp1v = vec_adds( temp1v, vec_and( mskA0, one ) ); temp2v = vec_adds( temp2v, vec_and( mskB0, one ) ); vec_st( temp1v, 0, dct0 ); vec_st( temp2v, 16, dct0 ); nz0 = vec_or( temp1v, temp2v ); multEvenvA1 = vec_mule( coefvA1, mfvA1 ); multOddvA1 = vec_mulo( coefvA1, mfvA1 ); multEvenvB1 = vec_mule( coefvB1, mfvB1 ); multOddvB1 = vec_mulo( coefvB1, mfvB1 ); multEvenvA1 = vec_sr( multEvenvA1, i_qbitsv ); multOddvA1 = vec_sr( multOddvA1, i_qbitsv ); multEvenvB1 = vec_sr( multEvenvB1, i_qbitsv ); multOddvB1 = vec_sr( multOddvB1, i_qbitsv ); temp1v = (vec_s16_t)vec_packs( multEvenvA1, multOddvA1 ); temp2v = (vec_s16_t)vec_packs( multEvenvB1, multOddvB1 ); tmpv0 = xxpermdi( temp1v, temp1v, 2 ); tmpv1 = xxpermdi( temp2v, temp2v, 2 ); temp1v = vec_mergeh( temp1v, tmpv0 ); temp2v = vec_mergeh( temp2v, tmpv1 ); temp1v = vec_xor( temp1v, mskA1 ); temp2v = vec_xor( temp2v, mskB1 ); temp1v = vec_adds( temp1v, vec_and( mskA1, one ) ); temp2v = vec_adds( temp2v, vec_and( mskB1, one ) ); vec_st( temp1v, 0, dct1 ); vec_st( temp2v, 16, dct1 ); nz1 = vec_or( temp1v, temp2v ); multEvenvA2 = vec_mule( coefvA2, mfvA2 ); multOddvA2 = vec_mulo( coefvA2, mfvA2 ); multEvenvB2 = vec_mule( coefvB2, mfvB2 ); multOddvB2 = vec_mulo( coefvB2, mfvB2 ); multEvenvA2 = vec_sr( multEvenvA2, i_qbitsv ); multOddvA2 = vec_sr( multOddvA2, i_qbitsv ); multEvenvB2 = vec_sr( multEvenvB2, i_qbitsv ); multOddvB2 = vec_sr( multOddvB2, 
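/* i_qbitsv is 16: the even/odd 32-bit products of (|coef|+bias) * mf are
 * shifted back down to quantized levels here; vec_packs plus the
 * xxpermdi/vec_mergeh shuffle that follows re-interleaves the even and
 * odd lanes into their original coefficient order before the sign is
 * restored. */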
i_qbitsv ); temp1v = (vec_s16_t)vec_packs( multEvenvA2, multOddvA2 ); temp2v = (vec_s16_t)vec_packs( multEvenvB2, multOddvB2 ); tmpv0 = xxpermdi( temp1v, temp1v, 2 ); tmpv1 = xxpermdi( temp2v, temp2v, 2 ); temp1v = vec_mergeh( temp1v, tmpv0 ); temp2v = vec_mergeh( temp2v, tmpv1 ); temp1v = vec_xor( temp1v, mskA2 ); temp2v = vec_xor( temp2v, mskB2 ); temp1v = vec_adds( temp1v, vec_and( mskA2, one ) ); temp2v = vec_adds( temp2v, vec_and( mskB2, one ) ); vec_st( temp1v, 0, dct2 ); vec_st( temp2v, 16, dct2 ); nz2 = vec_or( temp1v, temp2v ); multEvenvA3 = vec_mule( coefvA3, mfvA3 ); multOddvA3 = vec_mulo( coefvA3, mfvA3 ); multEvenvB3 = vec_mule( coefvB3, mfvB3 ); multOddvB3 = vec_mulo( coefvB3, mfvB3 ); multEvenvA3 = vec_sr( multEvenvA3, i_qbitsv ); multOddvA3 = vec_sr( multOddvA3, i_qbitsv ); multEvenvB3 = vec_sr( multEvenvB3, i_qbitsv ); multOddvB3 = vec_sr( multOddvB3, i_qbitsv ); temp1v = (vec_s16_t)vec_packs( multEvenvA3, multOddvA3 ); temp2v = (vec_s16_t)vec_packs( multEvenvB3, multOddvB3 ); tmpv0 = xxpermdi( temp1v, temp1v, 2 ); tmpv1 = xxpermdi( temp2v, temp2v, 2 ); temp1v = vec_mergeh( temp1v, tmpv0 ); temp2v = vec_mergeh( temp2v, tmpv1 ); temp1v = vec_xor( temp1v, mskA3 ); temp2v = vec_xor( temp2v, mskB3 ); temp1v = vec_adds( temp1v, vec_and( mskA3, one ) ); temp2v = vec_adds( temp2v, vec_and( mskB3, one ) ); vec_st( temp1v, 0, dct3 ); vec_st( temp2v, 16, dct3 ); nz3 = vec_or( temp1v, temp2v ); return (vec_any_ne( nz0, zero_s16v ) << 0) | (vec_any_ne( nz1, zero_s16v ) << 1) | (vec_any_ne( nz2, zero_s16v ) << 2) | (vec_any_ne( nz3, zero_s16v ) << 3); } // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" #define QUANT_16_U_DC( idx0, idx1 ) \ { \ temp1v = vec_ld((idx0), dct); \ temp2v = vec_ld((idx1), dct); \ mskA = vec_cmplt(temp1v, zero_s16v); \ mskB = vec_cmplt(temp2v, zero_s16v); \ coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\ coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\ coefvA = vec_add(coefvA, biasv); \ coefvB = vec_add(coefvB, biasv); \ multEvenvA = vec_mule(coefvA, mfv); \ multOddvA = vec_mulo(coefvA, mfv); \ multEvenvB = vec_mule(coefvB, mfv); \ multOddvB = vec_mulo(coefvB, mfv); \ multEvenvA = vec_sr(multEvenvA, i_qbitsv); \ multOddvA = vec_sr(multOddvA, i_qbitsv); \ multEvenvB = vec_sr(multEvenvB, i_qbitsv); \ multOddvB = vec_sr(multOddvB, i_qbitsv); \ temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \ temp1v = vec_xor(temp1v, mskA); \ temp2v = vec_xor(temp2v, mskB); \ temp1v = vec_add(temp1v, vec_and(mskA, one)); \ vec_st(temp1v, (idx0), dct); \ temp2v = vec_add(temp2v, vec_and(mskB, one)); \ nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), dct); \ } int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias ) { LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_s16_t one = vec_splat_s16(1); vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_s16_t temp1v, temp2v; vec_u16_t mfv; vec_u16_t biasv; mfv = vec_splats( (uint16_t)mf ); i_qbitsv = vec_splats( (uint32_t) 16 ); biasv = vec_splats( (uint16_t)bias ); QUANT_16_U_DC( 0, 16 ); return vec_any_ne(nz, zero_s16v); } // DC quant of a whole 2x2 block #define QUANT_4_U_DC( idx0 ) \ { \ const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \ temp1v = vec_ld((idx0), dct); \ mskA = 
vec_cmplt(temp1v, zero_s16v); \ coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\ coefvA = vec_add(coefvA, biasv); \ multEvenvA = vec_mule(coefvA, mfv); \ multOddvA = vec_mulo(coefvA, mfv); \ multEvenvA = vec_sr(multEvenvA, i_qbitsv); \ multOddvA = vec_sr(multOddvA, i_qbitsv); \ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \ temp2v = vec_xor(temp2v, mskA); \ temp2v = vec_add(temp2v, vec_and(mskA, one)); \ temp1v = vec_sel(temp1v, temp2v, sel); \ nz = vec_or(nz, temp1v); \ vec_st(temp1v, (idx0), dct); \ } int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias ) { LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_s16_t one = vec_splat_s16(1); vec_s16_t nz = zero_s16v; static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0); vec_s16_t temp1v, temp2v; vec_u16_t mfv; vec_u16_t biasv; mfv = vec_splats( (uint16_t)mf ); i_qbitsv = vec_splats( (uint32_t) 16 ); biasv = vec_splats( (uint16_t)bias ); QUANT_4_U_DC(0); return vec_any_ne(vec_and(nz, mask2), zero_s16v); } int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) { LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; vec_s16_t one = vec_splat_s16(1); vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_u16_t mfvB; vec_u16_t biasvB; vec_s16_t temp1v, temp2v, tmpv; i_qbitsv = vec_splats( (uint32_t)16 ); for( int i = 0; i < 4; i++ ) QUANT_16_U( i*2*16, i*2*16+16 ); return vec_any_ne(nz, zero_s16v); } #define DEQUANT_SHL() \ { \ dctv = vec_ld(8*y, dct); \ mf1v = vec_ld(16*y, dequant_mf[i_mf]); \ mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \ mfv = vec_packs(mf1v, mf2v); \ \ multEvenvA = vec_mule(dctv, mfv); \ multOddvA = vec_mulo(dctv, mfv); \ dctv = (vec_s16_t) vec_packs( multEvenvA, multOddvA ); \ tmpv = xxpermdi( dctv, dctv, 2 ); \ dctv = vec_mergeh( dctv, tmpv ); \ dctv = vec_sl(dctv, i_qbitsv); \ vec_st(dctv, 8*y, dct); \ } #ifdef WORDS_BIGENDIAN #define VEC_MULE vec_mule #define VEC_MULO vec_mulo #else #define VEC_MULE vec_mulo #define VEC_MULO vec_mule #endif #define DEQUANT_SHR() \ { \ dctv = vec_ld(8*y, dct); \ dct1v = vec_mergeh(dctv, dctv); \ dct2v = vec_mergel(dctv, dctv); \ mf1v = vec_ld(16*y, dequant_mf[i_mf]); \ mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \ \ multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v); \ multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v); \ temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp1v = vec_add(temp1v, fv); \ temp1v = vec_sra(temp1v, i_qbitsv); \ \ multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v); \ multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v); \ temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp2v = vec_add(temp2v, fv); \ temp2v = vec_sra(temp2v, i_qbitsv); \ \ dctv = (vec_s16_t)vec_packs(temp1v, temp2v); \ vec_st(dctv, y*8, dct); \ } void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp ) { int i_mf = i_qp%6; int i_qbits = i_qp/6 - 4; vec_s16_t dctv, tmpv; vec_s16_t dct1v, dct2v; vec_s32_t mf1v, mf2v; vec_s16_t mfv; vec_s32_t multEvenvA, multOddvA; vec_s32_t temp1v, temp2v; if( i_qbits >= 0 ) { vec_u16_t i_qbitsv; i_qbitsv = vec_splats( (uint16_t) i_qbits ); for( int y = 0; y < 4; y+=2 ) DEQUANT_SHL(); } else { const int f = 1 << (-i_qbits-1); vec_s32_t fv; fv = vec_splats( f ); vec_u32_t i_qbitsv; i_qbitsv = vec_splats( (uint32_t)-i_qbits ); vec_u32_t 
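/* DEQUANT_SHR path (i_qbits < 0). Per coefficient this computes
 *   dct[i] = ( dct[i] * dequant_mf[i_mf][i] + f ) >> (-i_qbits)
 * with f = 1 << (-i_qbits - 1) as the rounding term. The multipliers are
 * 32-bit, so each product is rebuilt from two 16-bit multiplies as
 * (coef * mf_hi << 16) + coef * mf_lo, which is what DEQUANT_SHR's
 * vec_sl(..., sixteenv) + vec_add sequence implements. */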
sixteenv; sixteenv = vec_splats( (uint32_t)16 ); for( int y = 0; y < 4; y+=2 ) DEQUANT_SHR(); } } void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp ) { int i_mf = i_qp%6; int i_qbits = i_qp/6 - 6; vec_s16_t dctv, tmpv; vec_s16_t dct1v, dct2v; vec_s32_t mf1v, mf2v; vec_s16_t mfv; vec_s32_t multEvenvA, multOddvA; vec_s32_t temp1v, temp2v; if( i_qbits >= 0 ) { vec_u16_t i_qbitsv; i_qbitsv = vec_splats((uint16_t)i_qbits ); for( int y = 0; y < 16; y+=2 ) DEQUANT_SHL(); } else { const int f = 1 << (-i_qbits-1); vec_s32_t fv; fv = vec_splats( f ); vec_u32_t i_qbitsv; i_qbitsv = vec_splats( (uint32_t)-i_qbits ); vec_u32_t sixteenv; sixteenv = vec_splats( (uint32_t)16 ); for( int y = 0; y < 16; y+=2 ) DEQUANT_SHR(); } } #endif // !HIGH_BIT_DEPTH x264-master/common/ppc/quant.h000066400000000000000000000044021502133446700164320ustar00rootroot00000000000000/***************************************************************************** * quant.h: ppc quantization ***************************************************************************** * Copyright (C) 2007-2025 x264 project * * Authors: Guillaume Poirier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_PPC_QUANT_H #define X264_PPC_QUANT_H #define x264_quant_4x4x4_altivec x264_template(quant_4x4x4_altivec) int x264_quant_4x4x4_altivec( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ); #define x264_quant_4x4_altivec x264_template(quant_4x4_altivec) int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ); #define x264_quant_8x8_altivec x264_template(quant_8x8_altivec) int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ); #define x264_quant_4x4_dc_altivec x264_template(quant_4x4_dc_altivec) int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias ); #define x264_quant_2x2_dc_altivec x264_template(quant_2x2_dc_altivec) int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias ); #define x264_dequant_4x4_altivec x264_template(dequant_4x4_altivec) void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_altivec x264_template(dequant_8x8_altivec) void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #endif x264-master/common/predict.c000066400000000000000000000766271502133446700161670ustar00rootroot00000000000000/***************************************************************************** * predict.c: intra prediction ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ /* predict4x4 are inspired from ffmpeg h264 decoder */ #include "common.h" #if HAVE_MMX # include "x86/predict.h" #endif #if HAVE_ALTIVEC # include "ppc/predict.h" #endif #if HAVE_ARMV6 # include "arm/predict.h" #endif #if HAVE_AARCH64 # include "aarch64/predict.h" #endif #if HAVE_MSA # include "mips/predict.h" #endif #if HAVE_LSX # include "loongarch/predict.h" #endif /**************************************************************************** * 16x16 prediction for intra luma block ****************************************************************************/ #define PREDICT_16x16_DC(v)\ for( int i = 0; i < 16; i++ )\ {\ MPIXEL_X4( src+ 0 ) = v;\ MPIXEL_X4( src+ 4 ) = v;\ MPIXEL_X4( src+ 8 ) = v;\ MPIXEL_X4( src+12 ) = v;\ src += FDEC_STRIDE;\ } void x264_predict_16x16_dc_c( pixel *src ) { int dc = 0; for( int i = 0; i < 16; i++ ) { dc += src[-1 + i * FDEC_STRIDE]; dc += src[i - FDEC_STRIDE]; } pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 ); PREDICT_16x16_DC( dcsplat ); } static void predict_16x16_dc_left_c( pixel *src ) { int dc = 0; for( int i = 0; i < 16; i++ ) dc += src[-1 + i * FDEC_STRIDE]; pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 ); PREDICT_16x16_DC( dcsplat ); } static void predict_16x16_dc_top_c( pixel *src ) { int dc = 0; for( int i = 0; i < 16; i++ ) dc += src[i - FDEC_STRIDE]; pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 ); PREDICT_16x16_DC( dcsplat ); } static void predict_16x16_dc_128_c( pixel *src ) { PREDICT_16x16_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) ); } void x264_predict_16x16_h_c( pixel *src ) { for( int i = 0; i < 16; i++ ) { const pixel4 v = PIXEL_SPLAT_X4( src[-1] ); MPIXEL_X4( src+ 0 ) = v; MPIXEL_X4( src+ 4 ) = v; MPIXEL_X4( src+ 8 ) = v; MPIXEL_X4( src+12 ) = v; src += FDEC_STRIDE; } } void x264_predict_16x16_v_c( pixel *src ) { pixel4 v0 = MPIXEL_X4( &src[ 0-FDEC_STRIDE] ); pixel4 v1 = MPIXEL_X4( &src[ 4-FDEC_STRIDE] ); pixel4 v2 = MPIXEL_X4( &src[ 8-FDEC_STRIDE] ); pixel4 v3 = MPIXEL_X4( &src[12-FDEC_STRIDE] ); for( int i = 0; i < 16; i++ ) { MPIXEL_X4( src+ 0 ) = v0; MPIXEL_X4( src+ 4 ) = v1; MPIXEL_X4( src+ 8 ) = v2; MPIXEL_X4( src+12 ) = v3; src += FDEC_STRIDE; } } void x264_predict_16x16_p_c( pixel *src ) { int H = 0, V = 0; /* calculate H and V */ for( int i = 0; i <= 7; i++ ) { H += ( i + 1 ) * ( src[ 8 + i - FDEC_STRIDE ] - src[6 -i -FDEC_STRIDE] ); V += ( i + 1 ) * ( src[-1 + (8+i)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); } int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[15 - FDEC_STRIDE] ); int b = ( 5 * H + 32 ) >> 6; int c = ( 5 * V + 32 ) >> 6; int i00 = a - b * 7 - c * 7 + 16; for( int y = 0; y < 16; y++ ) { int pix = i00; for( int x = 0; x < 16; x++ ) { src[x] = x264_clip_pixel( pix>>5 ); pix += b; } src += FDEC_STRIDE; i00 += c; } } /**************************************************************************** * 8x8 prediction for intra chroma block (4:2:0) ****************************************************************************/ static void predict_8x8c_dc_128_c( pixel *src ) { for( int y = 0; y < 8; y++ ) { MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); src += FDEC_STRIDE; } } static void predict_8x8c_dc_left_c( pixel *src ) { int dc0 = 0, dc1 = 0; for( int y = 0; y < 4; y++ ) { dc0 += src[y * FDEC_STRIDE - 1]; dc1 += src[(y+4) * FDEC_STRIDE - 1]; } pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); for( int y = 0; 
y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc0splat; MPIXEL_X4( src+4 ) = dc0splat; src += FDEC_STRIDE; } for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc1splat; MPIXEL_X4( src+4 ) = dc1splat; src += FDEC_STRIDE; } } static void predict_8x8c_dc_top_c( pixel *src ) { int dc0 = 0, dc1 = 0; for( int x = 0; x < 4; x++ ) { dc0 += src[x - FDEC_STRIDE]; dc1 += src[x + 4 - FDEC_STRIDE]; } pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); for( int y = 0; y < 8; y++ ) { MPIXEL_X4( src+0 ) = dc0splat; MPIXEL_X4( src+4 ) = dc1splat; src += FDEC_STRIDE; } } void x264_predict_8x8c_dc_c( pixel *src ) { int s0 = 0, s1 = 0, s2 = 0, s3 = 0; /* s0 s1 s2 s3 */ for( int i = 0; i < 4; i++ ) { s0 += src[i - FDEC_STRIDE]; s1 += src[i + 4 - FDEC_STRIDE]; s2 += src[-1 + i * FDEC_STRIDE]; s3 += src[-1 + (i+4)*FDEC_STRIDE]; } /* dc0 dc1 dc2 dc3 */ pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 ); pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 ); pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 ); pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 ); for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc0; MPIXEL_X4( src+4 ) = dc1; src += FDEC_STRIDE; } for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc2; MPIXEL_X4( src+4 ) = dc3; src += FDEC_STRIDE; } } void x264_predict_8x8c_h_c( pixel *src ) { for( int i = 0; i < 8; i++ ) { pixel4 v = PIXEL_SPLAT_X4( src[-1] ); MPIXEL_X4( src+0 ) = v; MPIXEL_X4( src+4 ) = v; src += FDEC_STRIDE; } } void x264_predict_8x8c_v_c( pixel *src ) { pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE ); pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE ); for( int i = 0; i < 8; i++ ) { MPIXEL_X4( src+0 ) = v0; MPIXEL_X4( src+4 ) = v1; src += FDEC_STRIDE; } } void x264_predict_8x8c_p_c( pixel *src ) { int H = 0, V = 0; for( int i = 0; i < 4; i++ ) { H += ( i + 1 ) * ( src[4+i - FDEC_STRIDE] - src[2 - i -FDEC_STRIDE] ); V += ( i + 1 ) * ( src[-1 +(i+4)*FDEC_STRIDE] - src[-1+(2-i)*FDEC_STRIDE] ); } int a = 16 * ( src[-1+7*FDEC_STRIDE] + src[7 - FDEC_STRIDE] ); int b = ( 17 * H + 16 ) >> 5; int c = ( 17 * V + 16 ) >> 5; int i00 = a -3*b -3*c + 16; for( int y = 0; y < 8; y++ ) { int pix = i00; for( int x = 0; x < 8; x++ ) { src[x] = x264_clip_pixel( pix>>5 ); pix += b; } src += FDEC_STRIDE; i00 += c; } } /**************************************************************************** * 8x16 prediction for intra chroma block (4:2:2) ****************************************************************************/ static void predict_8x16c_dc_128_c( pixel *src ) { for( int y = 0; y < 16; y++ ) { MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); src += FDEC_STRIDE; } } static void predict_8x16c_dc_left_c( pixel *src ) { for( int i = 0; i < 4; i++ ) { int dc = 0; for( int y = 0; y < 4; y++ ) dc += src[y*FDEC_STRIDE - 1]; pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 ); for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dcsplat; MPIXEL_X4( src+4 ) = dcsplat; src += FDEC_STRIDE; } } } static void predict_8x16c_dc_top_c( pixel *src ) { int dc0 = 0, dc1 = 0; for( int x = 0; x < 4; x++ ) { dc0 += src[x - FDEC_STRIDE]; dc1 += src[x + 4 - FDEC_STRIDE]; } pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); for( int y = 0; y < 16; y++ ) { MPIXEL_X4( src+0 ) = dc0splat; MPIXEL_X4( src+4 ) = dc1splat; src += FDEC_STRIDE; } } void x264_predict_8x16c_dc_c( pixel *src ) { int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0; /* s0 s1 s2 s3 s4 s5 */ for( int i = 
0; i < 4; i++ ) { s0 += src[i+0 - FDEC_STRIDE]; s1 += src[i+4 - FDEC_STRIDE]; s2 += src[-1 + (i+0) * FDEC_STRIDE]; s3 += src[-1 + (i+4) * FDEC_STRIDE]; s4 += src[-1 + (i+8) * FDEC_STRIDE]; s5 += src[-1 + (i+12) * FDEC_STRIDE]; } /* dc0 dc1 dc2 dc3 dc4 dc5 dc6 dc7 */ pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 ); pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 ); pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 ); pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 ); pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 ); pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 ); pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 ); pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 ); for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc0; MPIXEL_X4( src+4 ) = dc1; src += FDEC_STRIDE; } for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc2; MPIXEL_X4( src+4 ) = dc3; src += FDEC_STRIDE; } for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc4; MPIXEL_X4( src+4 ) = dc5; src += FDEC_STRIDE; } for( int y = 0; y < 4; y++ ) { MPIXEL_X4( src+0 ) = dc6; MPIXEL_X4( src+4 ) = dc7; src += FDEC_STRIDE; } } void x264_predict_8x16c_h_c( pixel *src ) { for( int i = 0; i < 16; i++ ) { pixel4 v = PIXEL_SPLAT_X4( src[-1] ); MPIXEL_X4( src+0 ) = v; MPIXEL_X4( src+4 ) = v; src += FDEC_STRIDE; } } void x264_predict_8x16c_v_c( pixel *src ) { pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE ); pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE ); for( int i = 0; i < 16; i++ ) { MPIXEL_X4( src+0 ) = v0; MPIXEL_X4( src+4 ) = v1; src += FDEC_STRIDE; } } void x264_predict_8x16c_p_c( pixel *src ) { int H = 0; int V = 0; for( int i = 0; i < 4; i++ ) H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] ); for( int i = 0; i < 8; i++ ) V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] ); int b = ( 17 * H + 16 ) >> 5; int c = ( 5 * V + 32 ) >> 6; int i00 = a -3*b -7*c + 16; for( int y = 0; y < 16; y++ ) { int pix = i00; for( int x = 0; x < 8; x++ ) { src[x] = x264_clip_pixel( pix>>5 ); pix += b; } src += FDEC_STRIDE; i00 += c; } } /**************************************************************************** * 4x4 prediction for intra luma block ****************************************************************************/ #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE] #define SRC_X4(x,y) MPIXEL_X4( &SRC(x,y) ) #define PREDICT_4x4_DC(v)\ SRC_X4(0,0) = SRC_X4(0,1) = SRC_X4(0,2) = SRC_X4(0,3) = v; static void predict_4x4_dc_128_c( pixel *src ) { PREDICT_4x4_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) ); } static void predict_4x4_dc_left_c( pixel *src ) { pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) + 2) >> 2 ); PREDICT_4x4_DC( dc ); } static void predict_4x4_dc_top_c( pixel *src ) { pixel4 dc = PIXEL_SPLAT_X4( (SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 2) >> 2 ); PREDICT_4x4_DC( dc ); } void x264_predict_4x4_dc_c( pixel *src ) { pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) + SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 4) >> 3 ); PREDICT_4x4_DC( dc ); } void x264_predict_4x4_h_c( pixel *src ) { SRC_X4(0,0) = PIXEL_SPLAT_X4( SRC(-1,0) ); SRC_X4(0,1) = PIXEL_SPLAT_X4( SRC(-1,1) ); SRC_X4(0,2) = PIXEL_SPLAT_X4( SRC(-1,2) ); SRC_X4(0,3) = PIXEL_SPLAT_X4( SRC(-1,3) ); } void x264_predict_4x4_v_c( pixel *src ) { PREDICT_4x4_DC(SRC_X4(0,-1)); } #define PREDICT_4x4_LOAD_LEFT\ int l0 = SRC(-1,0);\ int l1 = SRC(-1,1);\ int l2 = SRC(-1,2);\ UNUSED int l3 = SRC(-1,3); #define PREDICT_4x4_LOAD_TOP\ int t0 
= SRC(0,-1);\ int t1 = SRC(1,-1);\ int t2 = SRC(2,-1);\ UNUSED int t3 = SRC(3,-1); #define PREDICT_4x4_LOAD_TOP_RIGHT\ int t4 = SRC(4,-1);\ int t5 = SRC(5,-1);\ int t6 = SRC(6,-1);\ UNUSED int t7 = SRC(7,-1); #define F1(a,b) (((a)+(b)+1)>>1) #define F2(a,b,c) (((a)+2*(b)+(c)+2)>>2) static void predict_4x4_ddl_c( pixel *src ) { PREDICT_4x4_LOAD_TOP PREDICT_4x4_LOAD_TOP_RIGHT SRC(0,0)= F2(t0,t1,t2); SRC(1,0)=SRC(0,1)= F2(t1,t2,t3); SRC(2,0)=SRC(1,1)=SRC(0,2)= F2(t2,t3,t4); SRC(3,0)=SRC(2,1)=SRC(1,2)=SRC(0,3)= F2(t3,t4,t5); SRC(3,1)=SRC(2,2)=SRC(1,3)= F2(t4,t5,t6); SRC(3,2)=SRC(2,3)= F2(t5,t6,t7); SRC(3,3)= F2(t6,t7,t7); } static void predict_4x4_ddr_c( pixel *src ) { int lt = SRC(-1,-1); PREDICT_4x4_LOAD_LEFT PREDICT_4x4_LOAD_TOP SRC(3,0)= F2(t3,t2,t1); SRC(2,0)=SRC(3,1)= F2(t2,t1,t0); SRC(1,0)=SRC(2,1)=SRC(3,2)= F2(t1,t0,lt); SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)= F2(t0,lt,l0); SRC(0,1)=SRC(1,2)=SRC(2,3)= F2(lt,l0,l1); SRC(0,2)=SRC(1,3)= F2(l0,l1,l2); SRC(0,3)= F2(l1,l2,l3); } static void predict_4x4_vr_c( pixel *src ) { int lt = SRC(-1,-1); PREDICT_4x4_LOAD_LEFT PREDICT_4x4_LOAD_TOP SRC(0,3)= F2(l2,l1,l0); SRC(0,2)= F2(l1,l0,lt); SRC(0,1)=SRC(1,3)= F2(l0,lt,t0); SRC(0,0)=SRC(1,2)= F1(lt,t0); SRC(1,1)=SRC(2,3)= F2(lt,t0,t1); SRC(1,0)=SRC(2,2)= F1(t0,t1); SRC(2,1)=SRC(3,3)= F2(t0,t1,t2); SRC(2,0)=SRC(3,2)= F1(t1,t2); SRC(3,1)= F2(t1,t2,t3); SRC(3,0)= F1(t2,t3); } static void predict_4x4_hd_c( pixel *src ) { int lt= SRC(-1,-1); PREDICT_4x4_LOAD_LEFT PREDICT_4x4_LOAD_TOP SRC(0,3)= F1(l2,l3); SRC(1,3)= F2(l1,l2,l3); SRC(0,2)=SRC(2,3)= F1(l1,l2); SRC(1,2)=SRC(3,3)= F2(l0,l1,l2); SRC(0,1)=SRC(2,2)= F1(l0,l1); SRC(1,1)=SRC(3,2)= F2(lt,l0,l1); SRC(0,0)=SRC(2,1)= F1(lt,l0); SRC(1,0)=SRC(3,1)= F2(t0,lt,l0); SRC(2,0)= F2(t1,t0,lt); SRC(3,0)= F2(t2,t1,t0); } static void predict_4x4_vl_c( pixel *src ) { PREDICT_4x4_LOAD_TOP PREDICT_4x4_LOAD_TOP_RIGHT SRC(0,0)= F1(t0,t1); SRC(0,1)= F2(t0,t1,t2); SRC(1,0)=SRC(0,2)= F1(t1,t2); SRC(1,1)=SRC(0,3)= F2(t1,t2,t3); SRC(2,0)=SRC(1,2)= F1(t2,t3); SRC(2,1)=SRC(1,3)= F2(t2,t3,t4); SRC(3,0)=SRC(2,2)= F1(t3,t4); SRC(3,1)=SRC(2,3)= F2(t3,t4,t5); SRC(3,2)= F1(t4,t5); SRC(3,3)= F2(t4,t5,t6); } static void predict_4x4_hu_c( pixel *src ) { PREDICT_4x4_LOAD_LEFT SRC(0,0)= F1(l0,l1); SRC(1,0)= F2(l0,l1,l2); SRC(2,0)=SRC(0,1)= F1(l1,l2); SRC(3,0)=SRC(1,1)= F2(l1,l2,l3); SRC(2,1)=SRC(0,2)= F1(l2,l3); SRC(3,1)=SRC(1,2)= F2(l2,l3,l3); SRC(3,2)=SRC(1,3)=SRC(0,3)= SRC(2,2)=SRC(2,3)=SRC(3,3)= l3; } /**************************************************************************** * 8x8 prediction for intra luma block ****************************************************************************/ #define PL(y) \ edge[14-y] = F2(SRC(-1,y-1), SRC(-1,y), SRC(-1,y+1)); #define PT(x) \ edge[16+x] = F2(SRC(x-1,-1), SRC(x,-1), SRC(x+1,-1)); static void predict_8x8_filter_c( pixel *src, pixel edge[36], int i_neighbor, int i_filters ) { /* edge[7..14] = l7..l0 * edge[15] = lt * edge[16..31] = t0 .. t15 * edge[32] = t15 */ int have_lt = i_neighbor & MB_TOPLEFT; if( i_filters & MB_LEFT ) { edge[15] = (SRC(0,-1) + 2*SRC(-1,-1) + SRC(-1,0) + 2) >> 2; edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0)) + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) edge[6] = edge[7] = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2; } if( i_filters & MB_TOP ) { int have_tr = i_neighbor & MB_TOPRIGHT; edge[16] = ((have_lt ? SRC(-1,-1) : SRC(0,-1)) + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) edge[23] = (SRC(6,-1) + 2*SRC(7,-1) + (have_tr ? 
SRC(8,-1) : SRC(7,-1)) + 2) >> 2; if( i_filters & MB_TOPRIGHT ) { if( have_tr ) { PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14) edge[31] = edge[32] = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; } else { MPIXEL_X4( edge+24 ) = PIXEL_SPLAT_X4( SRC(7,-1) ); MPIXEL_X4( edge+28 ) = PIXEL_SPLAT_X4( SRC(7,-1) ); edge[32] = SRC(7,-1); } } } } #undef PL #undef PT #define PL(y) \ UNUSED int l##y = edge[14-y]; #define PT(x) \ UNUSED int t##x = edge[16+x]; #define PREDICT_8x8_LOAD_TOPLEFT \ int lt = edge[15]; #define PREDICT_8x8_LOAD_LEFT \ PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7) #define PREDICT_8x8_LOAD_TOP \ PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7) #define PREDICT_8x8_LOAD_TOPRIGHT \ PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14) PT(15) #define PREDICT_8x8_DC(v) \ for( int y = 0; y < 8; y++ ) { \ MPIXEL_X4( src+0 ) = v; \ MPIXEL_X4( src+4 ) = v; \ src += FDEC_STRIDE; \ } static void predict_8x8_dc_128_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) ); } static void predict_8x8_dc_left_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_LEFT pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3 ); PREDICT_8x8_DC( dc ); } static void predict_8x8_dc_top_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_TOP pixel4 dc = PIXEL_SPLAT_X4( (t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3 ); PREDICT_8x8_DC( dc ); } void x264_predict_8x8_dc_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_LEFT PREDICT_8x8_LOAD_TOP pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4 ); PREDICT_8x8_DC( dc ); } void x264_predict_8x8_h_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_LEFT #define ROW(y) MPIXEL_X4( src+y*FDEC_STRIDE+0 ) =\ MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = PIXEL_SPLAT_X4( l##y ); ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); #undef ROW } void x264_predict_8x8_v_c( pixel *src, pixel edge[36] ) { pixel4 top[2] = { MPIXEL_X4( edge+16 ), MPIXEL_X4( edge+20 ) }; for( int y = 0; y < 8; y++ ) { MPIXEL_X4( src+y*FDEC_STRIDE+0 ) = top[0]; MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top[1]; } } static void predict_8x8_ddl_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_TOP PREDICT_8x8_LOAD_TOPRIGHT SRC(0,0)= F2(t0,t1,t2); SRC(0,1)=SRC(1,0)= F2(t1,t2,t3); SRC(0,2)=SRC(1,1)=SRC(2,0)= F2(t2,t3,t4); SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= F2(t3,t4,t5); SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= F2(t4,t5,t6); SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= F2(t5,t6,t7); SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= F2(t6,t7,t8); SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= F2(t7,t8,t9); SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= F2(t8,t9,t10); SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= F2(t9,t10,t11); SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= F2(t10,t11,t12); SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= F2(t11,t12,t13); SRC(5,7)=SRC(6,6)=SRC(7,5)= F2(t12,t13,t14); SRC(6,7)=SRC(7,6)= F2(t13,t14,t15); SRC(7,7)= F2(t14,t15,t15); } static void predict_8x8_ddr_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_TOP PREDICT_8x8_LOAD_LEFT PREDICT_8x8_LOAD_TOPLEFT SRC(0,7)= F2(l7,l6,l5); SRC(0,6)=SRC(1,7)= F2(l6,l5,l4); SRC(0,5)=SRC(1,6)=SRC(2,7)= F2(l5,l4,l3); SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= F2(l4,l3,l2); SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= F2(l3,l2,l1); SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= F2(l2,l1,l0); SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= F2(l1,l0,lt); 
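/* Everything up to this point fills the diagonals below the main one from
 * the filtered left samples (l7..l0); the remaining statements fill the
 * main diagonal and the diagonals above it from the top-left sample and
 * the filtered top row (t0..t7), all using the 3-tap filter F2 defined
 * above. */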
SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= F2(l0,lt,t0); SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= F2(lt,t0,t1); SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= F2(t0,t1,t2); SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= F2(t1,t2,t3); SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= F2(t2,t3,t4); SRC(5,0)=SRC(6,1)=SRC(7,2)= F2(t3,t4,t5); SRC(6,0)=SRC(7,1)= F2(t4,t5,t6); SRC(7,0)= F2(t5,t6,t7); } static void predict_8x8_vr_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_TOP PREDICT_8x8_LOAD_LEFT PREDICT_8x8_LOAD_TOPLEFT SRC(0,6)= F2(l5,l4,l3); SRC(0,7)= F2(l6,l5,l4); SRC(0,4)=SRC(1,6)= F2(l3,l2,l1); SRC(0,5)=SRC(1,7)= F2(l4,l3,l2); SRC(0,2)=SRC(1,4)=SRC(2,6)= F2(l1,l0,lt); SRC(0,3)=SRC(1,5)=SRC(2,7)= F2(l2,l1,l0); SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= F2(l0,lt,t0); SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= F1(lt,t0); SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= F2(lt,t0,t1); SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= F1(t0,t1); SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= F2(t0,t1,t2); SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= F1(t1,t2); SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= F2(t1,t2,t3); SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= F1(t2,t3); SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= F2(t2,t3,t4); SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= F1(t3,t4); SRC(5,1)=SRC(6,3)=SRC(7,5)= F2(t3,t4,t5); SRC(5,0)=SRC(6,2)=SRC(7,4)= F1(t4,t5); SRC(6,1)=SRC(7,3)= F2(t4,t5,t6); SRC(6,0)=SRC(7,2)= F1(t5,t6); SRC(7,1)= F2(t5,t6,t7); SRC(7,0)= F1(t6,t7); } static void predict_8x8_hd_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_TOP PREDICT_8x8_LOAD_LEFT PREDICT_8x8_LOAD_TOPLEFT int p1 = pack_pixel_1to2(F1(l6,l7), F2(l5,l6,l7)); int p2 = pack_pixel_1to2(F1(l5,l6), F2(l4,l5,l6)); int p3 = pack_pixel_1to2(F1(l4,l5), F2(l3,l4,l5)); int p4 = pack_pixel_1to2(F1(l3,l4), F2(l2,l3,l4)); int p5 = pack_pixel_1to2(F1(l2,l3), F2(l1,l2,l3)); int p6 = pack_pixel_1to2(F1(l1,l2), F2(l0,l1,l2)); int p7 = pack_pixel_1to2(F1(l0,l1), F2(lt,l0,l1)); int p8 = pack_pixel_1to2(F1(lt,l0), F2(l0,lt,t0)); int p9 = pack_pixel_1to2(F2(t1,t0,lt), F2(t2,t1,t0)); int p10 = pack_pixel_1to2(F2(t3,t2,t1), F2(t4,t3,t2)); int p11 = pack_pixel_1to2(F2(t5,t4,t3), F2(t6,t5,t4)); SRC_X4(0,7)= pack_pixel_2to4(p1,p2); SRC_X4(0,6)= pack_pixel_2to4(p2,p3); SRC_X4(4,7)=SRC_X4(0,5)= pack_pixel_2to4(p3,p4); SRC_X4(4,6)=SRC_X4(0,4)= pack_pixel_2to4(p4,p5); SRC_X4(4,5)=SRC_X4(0,3)= pack_pixel_2to4(p5,p6); SRC_X4(4,4)=SRC_X4(0,2)= pack_pixel_2to4(p6,p7); SRC_X4(4,3)=SRC_X4(0,1)= pack_pixel_2to4(p7,p8); SRC_X4(4,2)=SRC_X4(0,0)= pack_pixel_2to4(p8,p9); SRC_X4(4,1)= pack_pixel_2to4(p9,p10); SRC_X4(4,0)= pack_pixel_2to4(p10,p11); } static void predict_8x8_vl_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_TOP PREDICT_8x8_LOAD_TOPRIGHT SRC(0,0)= F1(t0,t1); SRC(0,1)= F2(t0,t1,t2); SRC(0,2)=SRC(1,0)= F1(t1,t2); SRC(0,3)=SRC(1,1)= F2(t1,t2,t3); SRC(0,4)=SRC(1,2)=SRC(2,0)= F1(t2,t3); SRC(0,5)=SRC(1,3)=SRC(2,1)= F2(t2,t3,t4); SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= F1(t3,t4); SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= F2(t3,t4,t5); SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= F1(t4,t5); SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= F2(t4,t5,t6); SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= F1(t5,t6); SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= F2(t5,t6,t7); SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= F1(t6,t7); SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= F2(t6,t7,t8); SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= F1(t7,t8); SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= F2(t7,t8,t9); SRC(5,6)=SRC(6,4)=SRC(7,2)= F1(t8,t9); SRC(5,7)=SRC(6,5)=SRC(7,3)= F2(t8,t9,t10); SRC(6,6)=SRC(7,4)= F1(t9,t10); SRC(6,7)=SRC(7,5)= F2(t9,t10,t11); 
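/* Final corner samples of the vertical-left prediction: as in the rest of
 * the block, even rows take the 2-tap average F1 and odd rows the 3-tap
 * filter F2, sliding one sample further along the top/top-right edge. */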
SRC(7,6)= F1(t10,t11); SRC(7,7)= F2(t10,t11,t12); } static void predict_8x8_hu_c( pixel *src, pixel edge[36] ) { PREDICT_8x8_LOAD_LEFT int p1 = pack_pixel_1to2(F1(l0,l1), F2(l0,l1,l2)); int p2 = pack_pixel_1to2(F1(l1,l2), F2(l1,l2,l3)); int p3 = pack_pixel_1to2(F1(l2,l3), F2(l2,l3,l4)); int p4 = pack_pixel_1to2(F1(l3,l4), F2(l3,l4,l5)); int p5 = pack_pixel_1to2(F1(l4,l5), F2(l4,l5,l6)); int p6 = pack_pixel_1to2(F1(l5,l6), F2(l5,l6,l7)); int p7 = pack_pixel_1to2(F1(l6,l7), F2(l6,l7,l7)); int p8 = pack_pixel_1to2(l7,l7); SRC_X4(0,0)= pack_pixel_2to4(p1,p2); SRC_X4(0,1)= pack_pixel_2to4(p2,p3); SRC_X4(4,0)=SRC_X4(0,2)= pack_pixel_2to4(p3,p4); SRC_X4(4,1)=SRC_X4(0,3)= pack_pixel_2to4(p4,p5); SRC_X4(4,2)=SRC_X4(0,4)= pack_pixel_2to4(p5,p6); SRC_X4(4,3)=SRC_X4(0,5)= pack_pixel_2to4(p6,p7); SRC_X4(4,4)=SRC_X4(0,6)= pack_pixel_2to4(p7,p8); SRC_X4(4,5)=SRC_X4(4,6)= SRC_X4(0,7) = SRC_X4(4,7) = pack_pixel_2to4(p8,p8); } /**************************************************************************** * Exported functions: ****************************************************************************/ void x264_predict_16x16_init( uint32_t cpu, x264_predict_t pf[7] ) { pf[I_PRED_16x16_V ] = x264_predict_16x16_v_c; pf[I_PRED_16x16_H ] = x264_predict_16x16_h_c; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_c; pf[I_PRED_16x16_P ] = x264_predict_16x16_p_c; pf[I_PRED_16x16_DC_LEFT]= predict_16x16_dc_left_c; pf[I_PRED_16x16_DC_TOP ]= predict_16x16_dc_top_c; pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128_c; #if HAVE_MMX x264_predict_16x16_init_mmx( cpu, pf ); #endif #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) x264_predict_16x16_init_altivec( pf ); #endif #if HAVE_ARMV6 x264_predict_16x16_init_arm( cpu, pf ); #endif #if HAVE_AARCH64 x264_predict_16x16_init_aarch64( cpu, pf ); #endif #if !HIGH_BIT_DEPTH #if HAVE_MSA if( cpu&X264_CPU_MSA ) { pf[I_PRED_16x16_V ] = x264_intra_predict_vert_16x16_msa; pf[I_PRED_16x16_H ] = x264_intra_predict_hor_16x16_msa; pf[I_PRED_16x16_DC] = x264_intra_predict_dc_16x16_msa; pf[I_PRED_16x16_P ] = x264_intra_predict_plane_16x16_msa; pf[I_PRED_16x16_DC_LEFT]= x264_intra_predict_dc_left_16x16_msa; pf[I_PRED_16x16_DC_TOP ]= x264_intra_predict_dc_top_16x16_msa; pf[I_PRED_16x16_DC_128 ]= x264_intra_predict_dc_128_16x16_msa; } #endif #endif #if HAVE_LSX x264_predict_16x16_init_loongarch( cpu, pf ); #endif } void x264_predict_8x8c_init( uint32_t cpu, x264_predict_t pf[7] ) { pf[I_PRED_CHROMA_V ] = x264_predict_8x8c_v_c; pf[I_PRED_CHROMA_H ] = x264_predict_8x8c_h_c; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_c; pf[I_PRED_CHROMA_P ] = x264_predict_8x8c_p_c; pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8c_dc_left_c; pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top_c; pf[I_PRED_CHROMA_DC_128 ]= predict_8x8c_dc_128_c; #if HAVE_MMX x264_predict_8x8c_init_mmx( cpu, pf ); #endif #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) x264_predict_8x8c_init_altivec( pf ); #endif #if HAVE_ARMV6 x264_predict_8x8c_init_arm( cpu, pf ); #endif #if HAVE_AARCH64 x264_predict_8x8c_init_aarch64( cpu, pf ); #endif #if !HIGH_BIT_DEPTH #if HAVE_MSA if( cpu&X264_CPU_MSA ) { pf[I_PRED_CHROMA_P ] = x264_intra_predict_plane_8x8_msa; } #endif #endif #if HAVE_LSX x264_predict_8x8c_init_loongarch( cpu, pf ); #endif } void x264_predict_8x16c_init( uint32_t cpu, x264_predict_t pf[7] ) { pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_c; pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_c; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_c; pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_c; pf[I_PRED_CHROMA_DC_LEFT]= predict_8x16c_dc_left_c; 
pf[I_PRED_CHROMA_DC_TOP ]= predict_8x16c_dc_top_c; pf[I_PRED_CHROMA_DC_128 ]= predict_8x16c_dc_128_c; #if HAVE_MMX x264_predict_8x16c_init_mmx( cpu, pf ); #endif #if HAVE_ARMV6 x264_predict_8x16c_init_arm( cpu, pf ); #endif #if HAVE_AARCH64 x264_predict_8x16c_init_aarch64( cpu, pf ); #endif } void x264_predict_8x8_init( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { pf[I_PRED_8x8_V] = x264_predict_8x8_v_c; pf[I_PRED_8x8_H] = x264_predict_8x8_h_c; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_c; pf[I_PRED_8x8_DDL] = predict_8x8_ddl_c; pf[I_PRED_8x8_DDR] = predict_8x8_ddr_c; pf[I_PRED_8x8_VR] = predict_8x8_vr_c; pf[I_PRED_8x8_HD] = predict_8x8_hd_c; pf[I_PRED_8x8_VL] = predict_8x8_vl_c; pf[I_PRED_8x8_HU] = predict_8x8_hu_c; pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_c; pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_c; pf[I_PRED_8x8_DC_128] = predict_8x8_dc_128_c; *predict_filter = predict_8x8_filter_c; #if HAVE_MMX x264_predict_8x8_init_mmx( cpu, pf, predict_filter ); #endif #if HAVE_ARMV6 x264_predict_8x8_init_arm( cpu, pf, predict_filter ); #endif #if HAVE_AARCH64 x264_predict_8x8_init_aarch64( cpu, pf, predict_filter ); #endif #if !HIGH_BIT_DEPTH #if HAVE_MSA if( cpu&X264_CPU_MSA ) { pf[I_PRED_8x8_DDL] = x264_intra_predict_ddl_8x8_msa; } #endif #endif #if HAVE_LSX x264_predict_8x8_init_loongarch( cpu, pf, predict_filter ); #endif } void x264_predict_4x4_init( uint32_t cpu, x264_predict_t pf[12] ) { pf[I_PRED_4x4_V] = x264_predict_4x4_v_c; pf[I_PRED_4x4_H] = x264_predict_4x4_h_c; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_c; pf[I_PRED_4x4_DDL] = predict_4x4_ddl_c; pf[I_PRED_4x4_DDR] = predict_4x4_ddr_c; pf[I_PRED_4x4_VR] = predict_4x4_vr_c; pf[I_PRED_4x4_HD] = predict_4x4_hd_c; pf[I_PRED_4x4_VL] = predict_4x4_vl_c; pf[I_PRED_4x4_HU] = predict_4x4_hu_c; pf[I_PRED_4x4_DC_LEFT]= predict_4x4_dc_left_c; pf[I_PRED_4x4_DC_TOP] = predict_4x4_dc_top_c; pf[I_PRED_4x4_DC_128] = predict_4x4_dc_128_c; #if HAVE_MMX x264_predict_4x4_init_mmx( cpu, pf ); #endif #if HAVE_ARMV6 x264_predict_4x4_init_arm( cpu, pf ); #endif #if HAVE_AARCH64 x264_predict_4x4_init_aarch64( cpu, pf ); #endif #if HAVE_LSX x264_predict_4x4_init_loongarch( cpu, pf ); #endif } x264-master/common/predict.h000066400000000000000000000136611502133446700161610ustar00rootroot00000000000000/***************************************************************************** * predict.h: intra prediction ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_PREDICT_H #define X264_PREDICT_H typedef void (*x264_predict_t)( pixel *src ); typedef void (*x264_predict8x8_t)( pixel *src, pixel edge[36] ); typedef void (*x264_predict_8x8_filter_t)( pixel *src, pixel edge[36], int i_neighbor, int i_filters ); enum intra_chroma_pred_e { I_PRED_CHROMA_DC = 0, I_PRED_CHROMA_H = 1, I_PRED_CHROMA_V = 2, I_PRED_CHROMA_P = 3, I_PRED_CHROMA_DC_LEFT = 4, I_PRED_CHROMA_DC_TOP = 5, I_PRED_CHROMA_DC_128 = 6 }; static const uint8_t x264_mb_chroma_pred_mode_fix[7] = { I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P, I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC }; enum intra16x16_pred_e { I_PRED_16x16_V = 0, I_PRED_16x16_H = 1, I_PRED_16x16_DC = 2, I_PRED_16x16_P = 3, I_PRED_16x16_DC_LEFT = 4, I_PRED_16x16_DC_TOP = 5, I_PRED_16x16_DC_128 = 6, }; static const uint8_t x264_mb_pred_mode16x16_fix[7] = { I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, I_PRED_16x16_DC,I_PRED_16x16_DC,I_PRED_16x16_DC }; enum intra4x4_pred_e { I_PRED_4x4_V = 0, I_PRED_4x4_H = 1, I_PRED_4x4_DC = 2, I_PRED_4x4_DDL= 3, I_PRED_4x4_DDR= 4, I_PRED_4x4_VR = 5, I_PRED_4x4_HD = 6, I_PRED_4x4_VL = 7, I_PRED_4x4_HU = 8, I_PRED_4x4_DC_LEFT = 9, I_PRED_4x4_DC_TOP = 10, I_PRED_4x4_DC_128 = 11, }; static const int8_t x264_mb_pred_mode4x4_fix[13] = { -1, I_PRED_4x4_V, I_PRED_4x4_H, I_PRED_4x4_DC, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, I_PRED_4x4_DC, I_PRED_4x4_DC, I_PRED_4x4_DC }; #define x264_mb_pred_mode4x4_fix(t) x264_mb_pred_mode4x4_fix[(t)+1] /* must use the same numbering as intra4x4_pred_e */ enum intra8x8_pred_e { I_PRED_8x8_V = 0, I_PRED_8x8_H = 1, I_PRED_8x8_DC = 2, I_PRED_8x8_DDL= 3, I_PRED_8x8_DDR= 4, I_PRED_8x8_VR = 5, I_PRED_8x8_HD = 6, I_PRED_8x8_VL = 7, I_PRED_8x8_HU = 8, I_PRED_8x8_DC_LEFT = 9, I_PRED_8x8_DC_TOP = 10, I_PRED_8x8_DC_128 = 11, }; #define x264_predict_8x8_dc_c x264_template(predict_8x8_dc_c) void x264_predict_8x8_dc_c ( pixel *src, pixel edge[36] ); #define x264_predict_8x8_h_c x264_template(predict_8x8_h_c) void x264_predict_8x8_h_c ( pixel *src, pixel edge[36] ); #define x264_predict_8x8_v_c x264_template(predict_8x8_v_c) void x264_predict_8x8_v_c ( pixel *src, pixel edge[36] ); #define x264_predict_4x4_dc_c x264_template(predict_4x4_dc_c) void x264_predict_4x4_dc_c ( pixel *src ); #define x264_predict_4x4_h_c x264_template(predict_4x4_h_c) void x264_predict_4x4_h_c ( pixel *src ); #define x264_predict_4x4_v_c x264_template(predict_4x4_v_c) void x264_predict_4x4_v_c ( pixel *src ); #define x264_predict_16x16_dc_c x264_template(predict_16x16_dc_c) void x264_predict_16x16_dc_c( pixel *src ); #define x264_predict_16x16_h_c x264_template(predict_16x16_h_c) void x264_predict_16x16_h_c ( pixel *src ); #define x264_predict_16x16_v_c x264_template(predict_16x16_v_c) void x264_predict_16x16_v_c ( pixel *src ); #define x264_predict_16x16_p_c x264_template(predict_16x16_p_c) void x264_predict_16x16_p_c ( pixel *src ); #define x264_predict_8x8c_dc_c x264_template(predict_8x8c_dc_c) void x264_predict_8x8c_dc_c ( pixel *src ); #define x264_predict_8x8c_h_c x264_template(predict_8x8c_h_c) void x264_predict_8x8c_h_c ( pixel *src ); #define x264_predict_8x8c_v_c x264_template(predict_8x8c_v_c) void x264_predict_8x8c_v_c ( pixel *src ); #define x264_predict_8x8c_p_c x264_template(predict_8x8c_p_c) void x264_predict_8x8c_p_c ( pixel *src ); #define x264_predict_8x16c_dc_c x264_template(predict_8x16c_dc_c) 
void x264_predict_8x16c_dc_c( pixel *src ); #define x264_predict_8x16c_h_c x264_template(predict_8x16c_h_c) void x264_predict_8x16c_h_c ( pixel *src ); #define x264_predict_8x16c_v_c x264_template(predict_8x16c_v_c) void x264_predict_8x16c_v_c ( pixel *src ); #define x264_predict_8x16c_p_c x264_template(predict_8x16c_p_c) void x264_predict_8x16c_p_c ( pixel *src ); #define x264_predict_16x16_init x264_template(predict_16x16_init) void x264_predict_16x16_init ( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_8x8c_init x264_template(predict_8x8c_init) void x264_predict_8x8c_init ( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_8x16c_init x264_template(predict_8x16c_init) void x264_predict_8x16c_init ( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_4x4_init x264_template(predict_4x4_init) void x264_predict_4x4_init ( uint32_t cpu, x264_predict_t pf[12] ); #define x264_predict_8x8_init x264_template(predict_8x8_init) void x264_predict_8x8_init ( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); #endif x264-master/common/quant.c000066400000000000000000000752071502133446700156560ustar00rootroot00000000000000/***************************************************************************** * quant.c: quantization and level-run ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser * Christian Heine * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/

#include "common.h"

#if HAVE_MMX
#include "x86/quant.h"
#endif
#if HAVE_ALTIVEC
#   include "ppc/quant.h"
#endif
#if HAVE_ARMV6
#   include "arm/quant.h"
#endif
#if HAVE_AARCH64
#   include "aarch64/quant.h"
#endif
#if HAVE_MSA
#   include "mips/quant.h"
#endif
#if HAVE_LSX
#   include "loongarch/quant.h"
#endif

#define QUANT_ONE( coef, mf, f ) \
{ \
    if( (coef) > 0 ) \
        (coef) = ((f) + (uint32_t)(coef)) * (mf) >> 16; \
    else \
        (coef) = -(int32_t)(((f) + (uint32_t)(-coef)) * (mf) >> 16); \
    nz |= (coef); \
}

static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
{
    int nz = 0;
    for( int i = 0; i < 64; i++ )
        QUANT_ONE( dct[i], mf[i], bias[i] );
    return !!nz;
}

static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
{
    int nz = 0;
    for( int i = 0; i < 16; i++ )
        QUANT_ONE( dct[i], mf[i], bias[i] );
    return !!nz;
}

static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
{
    int nza = 0;
    for( int j = 0; j < 4; j++ )
    {
        int nz = 0;
        for( int i = 0; i < 16; i++ )
            QUANT_ONE( dct[j][i], mf[i], bias[i] );
        nza |= (!!nz)<<j;
    }
    return nza;
}

static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
{
    int nz = 0;
    for( int i = 0; i < 16; i++ )
        QUANT_ONE( dct[i], mf, bias );
    return !!nz;
}

static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
{
    int nz = 0;
    QUANT_ONE( dct[0], mf, bias );
    QUANT_ONE( dct[1], mf, bias );
    QUANT_ONE( dct[2], mf, bias );
    QUANT_ONE( dct[3], mf, bias );
    return !!nz;
}

#define DEQUANT_SHL( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] ) << i_qbits

#define DEQUANT_SHR( x ) \
    dct[x] = ( dct[x] * dequant_mf[i_mf][x] + f ) >> (-i_qbits)

static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
{
    const int i_mf = i_qp%6;
    const int i_qbits = i_qp/6 - 4;

    if( i_qbits >= 0 )
    {
        for( int i = 0; i < 16; i++ )
            DEQUANT_SHL( i );
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( int i = 0; i < 16; i++ )
            DEQUANT_SHR( i );
    }
}

static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
{
    const int i_mf = i_qp%6;
    const int i_qbits = i_qp/6 - 6;

    if( i_qbits >= 0 )
    {
        for( int i = 0; i < 64; i++ )
            DEQUANT_SHL( i );
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( int i = 0; i < 64; i++ )
            DEQUANT_SHR( i );
    }
}

static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
{
    const int i_qbits = i_qp/6 - 6;

    if( i_qbits >= 0 )
    {
        const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
        for( int i = 0; i < 16; i++ )
            dct[i] *= i_dmf;
    }
    else
    {
        const int i_dmf = dequant_mf[i_qp%6][0];
        const int f = 1 << (-i_qbits-1);
        for( int i = 0; i < 16; i++ )
            dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
    }
}

#define IDCT_DEQUANT_2X4_START \
    int a0 = dct[0] + dct[1]; \
    int a1 = dct[2] + dct[3]; \
    int a2 = dct[4] + dct[5]; \
    int a3 = dct[6] + dct[7]; \
    int a4 = dct[0] - dct[1]; \
    int a5 = dct[2] - dct[3]; \
    int a6 = dct[4] - dct[5]; \
    int a7 = dct[6] - dct[7]; \
    int b0 = a0 + a1; \
    int b1 = a2 + a3; \
    int b2 = a4 + a5; \
    int b3 = a6 + a7; \
    int b4 = a0 - a1; \
    int b5 = a2 - a3; \
    int b6 = a4 - a5; \
    int b7 = a6 - a7;

static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X4_START
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
    dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
    dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
    dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
    dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
    dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
    dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
    dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
    dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
}

static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
{
    IDCT_DEQUANT_2X4_START
    int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
    dct[0] = ((b0 + b1) * dmf + 32) >> 6;
    dct[1] = ((b2 + b3) * dmf + 32) >> 6;
    dct[2] = ((b0 - b1) * dmf + 32) >> 6;
    dct[3] = ((b2 - b3) * dmf + 32) >> 6;
    dct[4] = ((b4 - b5) * dmf + 32) >> 6;
    dct[5] = ((b6 - b7) * dmf + 32) >> 6;
    dct[6] = ((b4 + b5) * dmf + 32) >> 6;
    dct[7] = ((b6 + b7) * dmf + 32) >> 6;
} static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf ) { IDCT_DEQUANT_2X4_START out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */ out[1] = ((b2 + b3) * dmf + 2080) >> 6; out[2] = ((b0 - b1) * dmf + 2080) >> 6; out[3] = ((b2 - b3) * dmf + 2080) >> 6; out[4] = ((b4 - b5) * dmf + 2080) >> 6; out[5] = ((b6 - b7) * dmf + 2080) >> 6; out[6] = ((b4 + b5) * dmf + 2080) >> 6; out[7] = ((b6 + b7) * dmf + 2080) >> 6; } #undef IDCT_DEQUANT_2X4_START static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf ) { int d0 = dct[0] + dct[1]; int d1 = dct[2] + dct[3]; int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; out[0] = ((d0 + d1) * dmf >> 5) + 32; out[1] = ((d0 - d1) * dmf >> 5) + 32; out[2] = ((d2 + d3) * dmf >> 5) + 32; out[3] = ((d2 - d3) * dmf >> 5) + 32; } static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 ) { dctcoef out[8]; if( chroma422 ) optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf ); else optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf ); int sum = 0; for( int i = 0; i < (chroma422?8:4); i++ ) sum |= ref[i] ^ out[i]; return sum >> 6; } static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 ) { /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */ dctcoef dct_orig[8]; int coeff, nz; if( chroma422 ) optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf ); else optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf ); /* If the DC coefficients already round to zero, terminate early. */ int sum = 0; for( int i = 0; i < (chroma422?8:4); i++ ) sum |= dct_orig[i]; if( !(sum >> 6) ) return 0; /* Start with the highest frequency coefficient... is this the best option? */ for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- ) { int level = dct[coeff]; int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */ while( level ) { dct[coeff] = level - sign; if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) ) { nz = 1; dct[coeff] = level; break; } level -= sign; } } return nz; } static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) { return optimize_chroma_dc_internal( dct, dequant_mf, 0 ); } static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf ) { return optimize_chroma_dc_internal( dct, dequant_mf, 1 ); } static void denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ) { for( int i = 0; i < size; i++ ) { int level = dct[i]; int sign = level>>31; level = (level+sign)^sign; sum[i] += level; level -= offset[i]; dct[i] = level<0 ? 0 : (level^sign)-sign; } } /* (ref: JVT-B118) * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs * to 0 (low score means set it to null) * Used in inter macroblock (luma and chroma) * luma: for a 8x8 block: if score < 4 -> null * for the complete mb: if score < 6 -> null * chroma: for the complete mb: if score < 7 -> null */ static ALWAYS_INLINE int decimate_score_internal( dctcoef *dct, int i_max ) { const uint8_t *ds_table = (i_max == 64) ? 
x264_decimate_table8 : x264_decimate_table4; int i_score = 0; int idx = i_max - 1; while( idx >= 0 && dct[idx] == 0 ) idx--; while( idx >= 0 ) { int i_run; if( (unsigned)(dct[idx--] + 1) > 2 ) return 9; i_run = 0; while( idx >= 0 && dct[idx] == 0 ) { idx--; i_run++; } i_score += ds_table[i_run]; } return i_score; } static int decimate_score15( dctcoef *dct ) { return decimate_score_internal( dct+1, 15 ); } static int decimate_score16( dctcoef *dct ) { return decimate_score_internal( dct, 16 ); } static int decimate_score64( dctcoef *dct ) { return decimate_score_internal( dct, 64 ); } #define last(num)\ static int coeff_last##num( dctcoef *l )\ {\ int i_last = num-1;\ while( i_last >= 0 && l[i_last] == 0 )\ i_last--;\ return i_last;\ } last(4) last(8) last(15) last(16) last(64) #define level_run(num)\ static int coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\ {\ int i_last = runlevel->last = coeff_last##num(dct);\ int i_total = 0;\ int mask = 0;\ do\ {\ runlevel->level[i_total++] = dct[i_last];\ mask |= 1 << (i_last);\ while( --i_last >= 0 && dct[i_last] == 0 );\ } while( i_last >= 0 );\ runlevel->mask = mask;\ return i_total;\ } level_run(4) level_run(8) level_run(15) level_run(16) #if ARCH_X86_64 #define INIT_TRELLIS(cpu)\ pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\ pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\ pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\ pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\ pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\ pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu; #else #define INIT_TRELLIS(...) #endif void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; pf->quant_4x4 = quant_4x4; pf->quant_4x4x4 = quant_4x4x4; pf->quant_4x4_dc = quant_4x4_dc; pf->quant_2x2_dc = quant_2x2_dc; pf->dequant_4x4 = dequant_4x4; pf->dequant_4x4_dc = dequant_4x4_dc; pf->dequant_8x8 = dequant_8x8; pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc; pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly; pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc; pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc; pf->denoise_dct = denoise_dct; pf->decimate_score15 = decimate_score15; pf->decimate_score16 = decimate_score16; pf->decimate_score64 = decimate_score64; pf->coeff_last4 = coeff_last4; pf->coeff_last8 = coeff_last8; pf->coeff_last[ DCT_LUMA_AC] = coeff_last15; pf->coeff_last[ DCT_LUMA_4x4] = coeff_last16; pf->coeff_last[ DCT_LUMA_8x8] = coeff_last64; pf->coeff_level_run4 = coeff_level_run4; pf->coeff_level_run8 = coeff_level_run8; pf->coeff_level_run[ DCT_LUMA_AC] = coeff_level_run15; pf->coeff_level_run[ DCT_LUMA_4x4] = coeff_level_run16; #if HIGH_BIT_DEPTH #if HAVE_MMX INIT_TRELLIS( sse2 ); if( cpu&X264_CPU_MMX2 ) { #if ARCH_X86 pf->denoise_dct = x264_denoise_dct_mmx; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2; pf->coeff_level_run8 = x264_coeff_level_run8_mmx2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2; #endif pf->coeff_last4 = x264_coeff_last4_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; } if( cpu&X264_CPU_SSE2 ) { pf->quant_4x4 = x264_quant_4x4_sse2; pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->quant_2x2_dc = x264_quant_2x2_dc_sse2; 
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->dequant_4x4 = x264_dequant_4x4_sse2; pf->dequant_8x8 = x264_dequant_8x8_sse2; pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2; pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2; pf->denoise_dct = x264_denoise_dct_sse2; pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; pf->coeff_last8 = x264_coeff_last8_sse2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; pf->coeff_level_run8 = x264_coeff_level_run8_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; } if( cpu&X264_CPU_LZCNT ) { pf->coeff_last4 = x264_coeff_last4_lzcnt; pf->coeff_last8 = x264_coeff_last8_lzcnt; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) { pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); } if( cpu&X264_CPU_SSE4 ) { pf->quant_2x2_dc = x264_quant_2x2_dc_sse4; pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; pf->quant_4x4x4 = x264_quant_4x4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; } if( cpu&X264_CPU_AVX ) { pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx; pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx; pf->denoise_dct = x264_denoise_dct_avx; } if( cpu&X264_CPU_XOP ) { pf->dequant_4x4_dc = x264_dequant_4x4dc_xop; if( h->param.i_cqm_preset != X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_xop; pf->dequant_8x8 = x264_dequant_8x8_xop; } } if( cpu&X264_CPU_AVX2 ) { pf->quant_4x4 = x264_quant_4x4_avx2; pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; pf->quant_8x8 = x264_quant_8x8_avx2; pf->quant_4x4x4 = x264_quant_4x4x4_avx2; pf->dequant_4x4 = x264_dequant_4x4_avx2; pf->dequant_8x8 = x264_dequant_8x8_avx2; pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; pf->denoise_dct = x264_denoise_dct_avx2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; } if( cpu&X264_CPU_AVX512 ) { pf->dequant_4x4 = x264_dequant_4x4_avx512; pf->dequant_8x8 = x264_dequant_8x8_avx512; pf->decimate_score15 = x264_decimate_score15_avx512; pf->decimate_score16 = x264_decimate_score16_avx512; pf->decimate_score64 = x264_decimate_score64_avx512; pf->coeff_last4 = x264_coeff_last4_avx512; pf->coeff_last8 = x264_coeff_last8_avx512; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX #if HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { pf->quant_2x2_dc = x264_quant_2x2_dc_neon; pf->quant_4x4_dc = 
x264_quant_4x4_dc_neon; pf->quant_4x4 = x264_quant_4x4_neon; pf->quant_4x4x4 = x264_quant_4x4x4_neon; pf->quant_8x8 = x264_quant_8x8_neon; pf->dequant_4x4 = x264_dequant_4x4_neon; pf->dequant_8x8 = x264_dequant_8x8_neon; pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon; pf->decimate_score15 = x264_decimate_score15_neon; pf->decimate_score16 = x264_decimate_score16_neon; pf->decimate_score64 = x264_decimate_score64_neon; pf->coeff_last4 = x264_coeff_last4_neon; pf->coeff_last8 = x264_coeff_last8_neon; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon; pf->coeff_level_run4 = x264_coeff_level_run4_neon; pf->coeff_level_run8 = x264_coeff_level_run8_neon; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon; pf->denoise_dct = x264_denoise_dct_neon; } #endif // HAVE_AARCH64 #else // !HIGH_BIT_DEPTH #if HAVE_MMX INIT_TRELLIS( sse2 ); if( cpu&X264_CPU_MMX ) { #if ARCH_X86 pf->dequant_4x4 = x264_dequant_4x4_mmx; pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2; pf->dequant_8x8 = x264_dequant_8x8_mmx; if( h->param.i_cqm_preset == X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx; pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx; } pf->denoise_dct = x264_denoise_dct_mmx; #endif } if( cpu&X264_CPU_MMX2 ) { pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2; #if ARCH_X86 pf->quant_4x4 = x264_quant_4x4_mmx2; pf->quant_8x8 = x264_quant_8x8_mmx2; pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2; #endif pf->coeff_last4 = x264_coeff_last4_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; pf->coeff_level_run8 = x264_coeff_level_run8_mmx2; } if( cpu&X264_CPU_SSE2 ) { pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; pf->quant_4x4 = x264_quant_4x4_sse2; pf->quant_4x4x4 = x264_quant_4x4x4_sse2; pf->quant_8x8 = x264_quant_8x8_sse2; pf->dequant_4x4 = x264_dequant_4x4_sse2; pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; pf->dequant_8x8 = x264_dequant_8x8_sse2; if( h->param.i_cqm_preset == X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2; pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; } pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2; pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2; pf->denoise_dct = x264_denoise_dct_sse2; pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; pf->decimate_score64 = x264_decimate_score64_sse2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; } if( cpu&X264_CPU_LZCNT ) { pf->coeff_last4 = x264_coeff_last4_lzcnt; pf->coeff_last8 = x264_coeff_last8_lzcnt; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; 
pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) { pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3; pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_4x4x4 = x264_quant_4x4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; if( cpu&X264_CPU_LZCNT ) { pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt; pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; } } if( cpu&X264_CPU_SSE4 ) { pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4; } if( cpu&X264_CPU_AVX ) { pf->dequant_4x4_dc = x264_dequant_4x4dc_avx; if( h->param.i_cqm_preset != X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_avx; pf->dequant_8x8 = x264_dequant_8x8_avx; } pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx; pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx; pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx; pf->denoise_dct = x264_denoise_dct_avx; } if( cpu&X264_CPU_XOP ) { if( h->param.i_cqm_preset != X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_xop; pf->dequant_8x8 = x264_dequant_8x8_xop; } } if( cpu&X264_CPU_AVX2 ) { pf->quant_4x4 = x264_quant_4x4_avx2; pf->quant_4x4_dc = x264_quant_4x4_dc_avx2; pf->quant_8x8 = x264_quant_8x8_avx2; pf->quant_4x4x4 = x264_quant_4x4x4_avx2; pf->dequant_4x4 = x264_dequant_4x4_avx2; pf->dequant_8x8 = x264_dequant_8x8_avx2; pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; if( h->param.i_cqm_preset == X264_CQM_FLAT ) { pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2; pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2; } pf->decimate_score64 = x264_decimate_score64_avx2; pf->denoise_dct = x264_denoise_dct_avx2; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2; } if( cpu&X264_CPU_AVX512 ) { if( h->param.i_cqm_preset == X264_CQM_FLAT ) pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512; else { pf->dequant_4x4 = x264_dequant_4x4_avx512; pf->dequant_8x8 = x264_dequant_8x8_avx512; } pf->decimate_score15 = x264_decimate_score15_avx512; pf->decimate_score16 = x264_decimate_score16_avx512; pf->decimate_score64 = x264_decimate_score64_avx512; pf->coeff_last8 = x264_coeff_last8_avx512; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { pf->quant_2x2_dc = x264_quant_2x2_dc_altivec; pf->quant_4x4_dc = x264_quant_4x4_dc_altivec; pf->quant_4x4 = x264_quant_4x4_altivec; 
pf->quant_4x4x4 = x264_quant_4x4x4_altivec; pf->quant_8x8 = x264_quant_8x8_altivec; pf->dequant_4x4 = x264_dequant_4x4_altivec; pf->dequant_8x8 = x264_dequant_8x8_altivec; } #endif #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) { pf->coeff_last4 = x264_coeff_last4_arm; pf->coeff_last8 = x264_coeff_last8_arm; } #endif #if HAVE_ARMV6 || HAVE_AARCH64 if( cpu&X264_CPU_NEON ) { pf->quant_2x2_dc = x264_quant_2x2_dc_neon; pf->quant_4x4 = x264_quant_4x4_neon; pf->quant_4x4_dc = x264_quant_4x4_dc_neon; pf->quant_4x4x4 = x264_quant_4x4x4_neon; pf->quant_8x8 = x264_quant_8x8_neon; pf->dequant_4x4 = x264_dequant_4x4_neon; pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon; pf->dequant_8x8 = x264_dequant_8x8_neon; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon; pf->denoise_dct = x264_denoise_dct_neon; pf->decimate_score15 = x264_decimate_score15_neon; pf->decimate_score16 = x264_decimate_score16_neon; pf->decimate_score64 = x264_decimate_score64_neon; } #endif #if HAVE_AARCH64 if( cpu&X264_CPU_ARMV8 ) { pf->coeff_last4 = x264_coeff_last4_aarch64; pf->coeff_last8 = x264_coeff_last8_aarch64; pf->coeff_level_run4 = x264_coeff_level_run4_aarch64; } if( cpu&X264_CPU_NEON ) { pf->coeff_level_run8 = x264_coeff_level_run8_neon; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon; } #endif #if HAVE_MSA if( cpu&X264_CPU_MSA ) { pf->quant_4x4 = x264_quant_4x4_msa; pf->quant_4x4_dc = x264_quant_4x4_dc_msa; pf->quant_4x4x4 = x264_quant_4x4x4_msa; pf->quant_8x8 = x264_quant_8x8_msa; pf->dequant_4x4 = x264_dequant_4x4_msa; pf->dequant_4x4_dc = x264_dequant_4x4_dc_msa; pf->dequant_8x8 = x264_dequant_8x8_msa; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_msa; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa; } #endif #if HAVE_LSX if( cpu&X264_CPU_LSX ) { pf->quant_4x4 = x264_quant_4x4_lsx; pf->quant_4x4x4 = x264_quant_4x4x4_lsx; pf->quant_8x8 = x264_quant_8x8_lsx; pf->quant_4x4_dc = x264_quant_4x4_dc_lsx; pf->quant_2x2_dc = x264_quant_2x2_dc_lsx; pf->dequant_4x4 = x264_dequant_4x4_lsx; pf->dequant_8x8 = x264_dequant_8x8_lsx; pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx; pf->decimate_score15 = x264_decimate_score15_lsx; pf->decimate_score16 = x264_decimate_score16_lsx; pf->decimate_score64 = x264_decimate_score64_lsx; pf->coeff_last4 = x264_coeff_last4_lsx; pf->coeff_last8 = x264_coeff_last8_lsx; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx; pf->coeff_level_run8 = x264_coeff_level_run8_lsx; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx; } if( cpu&X264_CPU_LASX ) { pf->quant_4x4x4 = x264_quant_4x4x4_lasx; pf->dequant_4x4 = x264_dequant_4x4_lasx; pf->dequant_8x8 = x264_dequant_8x8_lasx; pf->dequant_4x4_dc = x264_dequant_4x4_dc_lasx; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx; } #endif #endif // HIGH_BIT_DEPTH pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] = pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = 
pf->coeff_last[DCT_LUMA_4x4]; pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[DCT_CHROMAU_AC] = pf->coeff_last[DCT_CHROMAV_AC] = pf->coeff_last[DCT_LUMA_AC]; pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8]; pf->coeff_level_run[DCT_LUMA_DC] = pf->coeff_level_run[DCT_CHROMAU_DC] = pf->coeff_level_run[DCT_CHROMAV_DC] = pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4]; pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[DCT_CHROMAU_AC] = pf->coeff_level_run[DCT_CHROMAV_AC] = pf->coeff_level_run[DCT_LUMA_AC]; } x264-master/common/quant.h000066400000000000000000000072521502133446700156560ustar00rootroot00000000000000/***************************************************************************** * quant.h: quantization and level-run ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_QUANT_H #define X264_QUANT_H typedef struct { int (*quant_8x8) ( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); int (*quant_4x4) ( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); int (*quant_4x4x4)( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias ); int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias ); void (*dequant_8x8)( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf ); int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf ); void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); int (*decimate_score15)( dctcoef *dct ); int (*decimate_score16)( dctcoef *dct ); int (*decimate_score64)( dctcoef *dct ); int (*coeff_last[14])( dctcoef *dct ); int (*coeff_last4)( dctcoef *dct ); int (*coeff_last8)( dctcoef *dct ); int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel ); int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel ); int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel ); #define TRELLIS_PARAMS const int *unquant_mf, const uint8_t *zigzag, int lambda2,\ int last_nnz, dctcoef *coefs, dctcoef *quant_coefs, dctcoef *dct,\ uint8_t *cabac_state_sig, uint8_t *cabac_state_last,\ uint64_t level_state0, uint16_t level_state1 int (*trellis_cabac_4x4)( TRELLIS_PARAMS, int b_ac ); int (*trellis_cabac_8x8)( TRELLIS_PARAMS, int b_interlaced ); int (*trellis_cabac_4x4_psy)( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int psy_trellis ); int (*trellis_cabac_8x8_psy)( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int psy_trellis ); int (*trellis_cabac_dc)( TRELLIS_PARAMS, int num_coefs ); int (*trellis_cabac_chroma_422_dc)( TRELLIS_PARAMS ); } x264_quant_function_t; #define x264_quant_init x264_template(quant_init) void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ); #endif x264-master/common/rectangle.c000066400000000000000000000040401502133446700164550ustar00rootroot00000000000000/***************************************************************************** * rectangle.c: rectangle filling ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. 
* For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" #define CACHE_FUNC(name,size,width,height)\ static void macroblock_cache_##name##_##width##_##height( void *target, uint32_t val )\ {\ x264_macroblock_cache_rect( target, width*size, height, size, val );\ } #define CACHE_FUNCS(name,size)\ CACHE_FUNC(name,size,4,4)\ CACHE_FUNC(name,size,2,4)\ CACHE_FUNC(name,size,4,2)\ CACHE_FUNC(name,size,2,2)\ CACHE_FUNC(name,size,2,1)\ CACHE_FUNC(name,size,1,2)\ CACHE_FUNC(name,size,1,1)\ void (*x264_cache_##name##_func_table[10])(void *, uint32_t) =\ {\ macroblock_cache_##name##_1_1,\ macroblock_cache_##name##_2_1,\ macroblock_cache_##name##_1_2,\ macroblock_cache_##name##_2_2,\ NULL,\ macroblock_cache_##name##_4_2,\ NULL,\ macroblock_cache_##name##_2_4,\ NULL,\ macroblock_cache_##name##_4_4\ };\ CACHE_FUNCS(mv, 4) CACHE_FUNCS(mvd, 2) CACHE_FUNCS(ref, 1) x264-master/common/rectangle.h000066400000000000000000000133421502133446700164670ustar00rootroot00000000000000/***************************************************************************** * rectangle.h: rectangle filling ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Fiona Glaser * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ /* This function should only be called with constant w / h / s arguments! */ static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, int s, uint32_t v ) { uint8_t *d = dst; uint16_t v2 = s >= 2 ? v : v * 0x101; uint32_t v4 = s >= 4 ? v : s >= 2 ? 
v * 0x10001 : v * 0x1010101; uint64_t v8 = v4 + ((uint64_t)v4 << 32); s *= 8; if( w == 2 ) { M16( d+s*0 ) = v2; if( h == 1 ) return; M16( d+s*1 ) = v2; if( h == 2 ) return; M16( d+s*2 ) = v2; M16( d+s*3 ) = v2; } else if( w == 4 ) { M32( d+s*0 ) = v4; if( h == 1 ) return; M32( d+s*1 ) = v4; if( h == 2 ) return; M32( d+s*2 ) = v4; M32( d+s*3 ) = v4; } else if( w == 8 ) { if( WORD_SIZE == 8 ) { M64( d+s*0 ) = v8; if( h == 1 ) return; M64( d+s*1 ) = v8; if( h == 2 ) return; M64( d+s*2 ) = v8; M64( d+s*3 ) = v8; } else { M32( d+s*0+0 ) = v4; M32( d+s*0+4 ) = v4; if( h == 1 ) return; M32( d+s*1+0 ) = v4; M32( d+s*1+4 ) = v4; if( h == 2 ) return; M32( d+s*2+0 ) = v4; M32( d+s*2+4 ) = v4; M32( d+s*3+0 ) = v4; M32( d+s*3+4 ) = v4; } } else if( w == 16 ) { /* height 1, width 16 doesn't occur */ assert( h != 1 ); #if HAVE_VECTOREXT && defined(__SSE__) v4si v16 = {v,v,v,v}; M128( d+s*0+0 ) = (__m128)v16; M128( d+s*1+0 ) = (__m128)v16; if( h == 2 ) return; M128( d+s*2+0 ) = (__m128)v16; M128( d+s*3+0 ) = (__m128)v16; #else if( WORD_SIZE == 8 ) { do { M64( d+s*0+0 ) = v8; M64( d+s*0+8 ) = v8; M64( d+s*1+0 ) = v8; M64( d+s*1+8 ) = v8; h -= 2; d += s*2; } while( h ); } else { do { M32( d+ 0 ) = v4; M32( d+ 4 ) = v4; M32( d+ 8 ) = v4; M32( d+12 ) = v4; d += s; } while( --h ); } #endif } else assert(0); } #define x264_cache_mv_func_table x264_template(cache_mv_func_table) extern void (*x264_cache_mv_func_table[10])(void *, uint32_t); #define x264_cache_mvd_func_table x264_template(cache_mvd_func_table) extern void (*x264_cache_mvd_func_table[10])(void *, uint32_t); #define x264_cache_ref_func_table x264_template(cache_ref_func_table) extern void (*x264_cache_ref_func_table[10])(void *, uint32_t); #define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) ) static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv ) { void *mv_cache = &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y]; if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) ) x264_cache_mv_func_table[width + (height<<1)-3]( mv_cache, mv ); else x264_macroblock_cache_rect( mv_cache, width*4, height, 4, mv ); } static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint16_t mvd ) { void *mvd_cache = &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y]; if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) ) x264_cache_mvd_func_table[width + (height<<1)-3]( mvd_cache, mvd ); else x264_macroblock_cache_rect( mvd_cache, width*2, height, 2, mvd ); } static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int8_t ref ) { void *ref_cache = &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y]; if( x264_nonconstant_p( width ) || x264_nonconstant_p( height ) ) x264_cache_ref_func_table[width + (height<<1)-3]( ref_cache, (uint8_t)ref ); else x264_macroblock_cache_rect( ref_cache, width, height, 1, (uint8_t)ref ); } static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip ) { x264_macroblock_cache_rect( &h->mb.cache.skip[X264_SCAN8_0+x+8*y], width, height, 1, b_skip ); } static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x, int y, int i_mode ) { x264_macroblock_cache_rect( &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y], 2, 2, 1, i_mode ); } 
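/* Illustrative sketch only, not part of x264: a plain scalar model of the
 * constant-size fill above and of the slot index used when the size is not a
 * compile-time constant.  The names cache_rect_ref and slot_for are
 * hypothetical and exist only for this demonstration; the 8*s-byte row stride
 * and the little-endian byte replication mirror x264_macroblock_cache_rect
 * and the 10-entry x264_cache_*_func_table in rectangle.c.  The real routine
 * is force-inlined with constant w/h/s so the compiler collapses it to a few
 * wide stores; the function tables cover the non-constant case. */
#if 0
#include <stdint.h>
#include <string.h>

/* Write the s-byte value v into a block of w bytes per row and h rows,
 * with a row stride of 8*s bytes (the layout of the mb cache arrays). */
static void cache_rect_ref( uint8_t *d, int w, int h, int s, uint32_t v )
{
    for( int y = 0; y < h; y++ )
        for( int x = 0; x < w; x += s )
            memcpy( &d[y*8*s + x], &v, s ); /* assumes little-endian, like the v*0x101 replication */
}

/* Map the (width,height) pairs that actually occur (1x1, 2x1, 1x2, 2x2,
 * 4x2, 2x4, 4x4) to slots 0,1,2,3,5,7,9 of the function tables; slots
 * 4, 6 and 8 are the NULL entries in rectangle.c. */
static int slot_for( int width, int height )
{
    return width + (height << 1) - 3;
}
#endif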
x264-master/common/set.c000066400000000000000000000334201502133446700153100ustar00rootroot00000000000000/***************************************************************************** * set.c: quantization init ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" #define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s)) #define DIV(n,d) (((n) + ((d)>>1)) / (d)) static const uint8_t dequant4_scale[6][3] = { { 10, 13, 16 }, { 11, 14, 18 }, { 13, 16, 20 }, { 14, 18, 23 }, { 16, 20, 25 }, { 18, 23, 29 } }; static const uint16_t quant4_scale[6][3] = { { 13107, 8066, 5243 }, { 11916, 7490, 4660 }, { 10082, 6554, 4194 }, { 9362, 5825, 3647 }, { 8192, 5243, 3355 }, { 7282, 4559, 2893 }, }; static const uint8_t quant8_scan[16] = { 0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1 }; static const uint8_t dequant8_scale[6][6] = { { 20, 18, 32, 19, 25, 24 }, { 22, 19, 35, 21, 28, 26 }, { 26, 23, 42, 24, 33, 31 }, { 28, 25, 45, 26, 35, 33 }, { 32, 28, 51, 30, 40, 38 }, { 36, 32, 58, 34, 46, 43 }, }; static const uint16_t quant8_scale[6][6] = { { 13107, 11428, 20972, 12222, 16777, 15481 }, { 11916, 10826, 19174, 11058, 14980, 14290 }, { 10082, 8943, 15978, 9675, 12710, 11985 }, { 9362, 8228, 14913, 8931, 11984, 11259 }, { 8192, 7346, 13159, 7740, 10486, 9777 }, { 7282, 6428, 11570, 6830, 9118, 8640 } }; int x264_cqm_init( x264_t *h ) { int def_quant4[6][16]; int def_quant8[6][64]; int def_dequant4[6][16]; int def_dequant8[6][64]; int quant4_mf[4][6][16]; int quant8_mf[4][6][64]; int deadzone[4] = { 32 - h->param.analyse.i_luma_deadzone[1], 32 - h->param.analyse.i_luma_deadzone[0], 32 - 11, 32 - 21 }; int max_qp_err = -1; int max_chroma_qp_err = -1; int min_qp_err = QP_MAX+1; int num_8x8_lists = h->sps->i_chroma_format_idc == CHROMA_444 ? 4 : h->param.analyse.b_transform_8x8 ? 2 : 0; /* Checkasm may segfault if optimized out by --chroma-format */ #define CQM_ALLOC( w, count )\ for( int i = 0; i < count; i++ )\ {\ int size = w*w;\ int start = w == 8 ? 
4 : 0;\ int j;\ for( j = 0; j < i; j++ )\ if( !memcmp( h->sps->scaling_list[i+start], h->sps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ break;\ if( j < i )\ {\ h-> quant##w##_mf[i] = h-> quant##w##_mf[j];\ h->dequant##w##_mf[i] = h->dequant##w##_mf[j];\ h->unquant##w##_mf[i] = h->unquant##w##_mf[j];\ }\ else\ {\ CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\ CHECKED_MALLOC( h->dequant##w##_mf[i], 6*size*sizeof(int) );\ CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(int) );\ }\ for( j = 0; j < i; j++ )\ if( deadzone[j] == deadzone[i] &&\ !memcmp( h->sps->scaling_list[i+start], h->sps->scaling_list[j+start], size*sizeof(uint8_t) ) )\ break;\ if( j < i )\ {\ h->quant##w##_bias[i] = h->quant##w##_bias[j];\ h->quant##w##_bias0[i] = h->quant##w##_bias0[j];\ }\ else\ {\ CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\ CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\ }\ } CQM_ALLOC( 4, 4 ) CQM_ALLOC( 8, num_8x8_lists ) for( int q = 0; q < 6; q++ ) { for( int i = 0; i < 16; i++ ) { int j = (i&1) + ((i>>2)&1); def_dequant4[q][i] = dequant4_scale[q][j]; def_quant4[q][i] = quant4_scale[q][j]; } for( int i = 0; i < 64; i++ ) { int j = quant8_scan[((i>>1)&12) | (i&3)]; def_dequant8[q][i] = dequant8_scale[q][j]; def_quant8[q][i] = quant8_scale[q][j]; } } for( int q = 0; q < 6; q++ ) { for( int i_list = 0; i_list < 4; i_list++ ) for( int i = 0; i < 16; i++ ) { h->dequant4_mf[i_list][q][i] = def_dequant4[q][i] * h->sps->scaling_list[i_list][i]; quant4_mf[i_list][q][i] = DIV(def_quant4[q][i] * 16, h->sps->scaling_list[i_list][i]); } for( int i_list = 0; i_list < num_8x8_lists; i_list++ ) for( int i = 0; i < 64; i++ ) { h->dequant8_mf[i_list][q][i] = def_dequant8[q][i] * h->sps->scaling_list[4+i_list][i]; quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->sps->scaling_list[4+i_list][i]); } } #define MAX_MF X264_MIN( 0xffff, (1 << (25 - BIT_DEPTH)) - 1 ) for( int q = 0; q <= QP_MAX_SPEC; q++ ) { int j; for( int i_list = 0; i_list < 4; i_list++ ) for( int i = 0; i < 16; i++ ) { h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i]; j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1); h->quant4_mf[i_list][q][i] = (uint16_t)j; if( !j ) { min_qp_err = X264_MIN( min_qp_err, q ); continue; } // round to nearest, unless that would cause the deadzone to be negative h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); h->quant4_bias0[i_list][q][i] = (1<<15)/j; if( j > MAX_MF && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) ) max_qp_err = q; if( j > MAX_MF && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) ) max_chroma_qp_err = q; } if( h->param.analyse.b_transform_8x8 ) for( int i_list = 0; i_list < num_8x8_lists; i_list++ ) for( int i = 0; i < 64; i++ ) { h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i]; j = SHIFT(quant8_mf[i_list][q%6][i], q/6); h->quant8_mf[i_list][q][i] = (uint16_t)j; if( !j ) { min_qp_err = X264_MIN( min_qp_err, q ); continue; } h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); h->quant8_bias0[i_list][q][i] = (1<<15)/j; if( j > MAX_MF && q > max_qp_err && (i_list == CQM_8IY || i_list == CQM_8PY) ) max_qp_err = q; if( j > MAX_MF && q > max_chroma_qp_err && (i_list == CQM_8IC || i_list == CQM_8PC) ) max_chroma_qp_err = q; } } /* Emergency mode denoising. 
*/ x264_emms(); CHECKED_MALLOC( h->nr_offset_emergency, sizeof(*h->nr_offset_emergency)*(QP_MAX-QP_MAX_SPEC) ); for( int q = 0; q < QP_MAX - QP_MAX_SPEC; q++ ) for( int cat = 0; cat < 3 + CHROMA444; cat++ ) { int dct8x8 = cat&1; if( !h->param.analyse.b_transform_8x8 && dct8x8 ) continue; int size = dct8x8 ? 64 : 16; udctcoef *nr_offset = h->nr_offset_emergency[q][cat]; /* Denoise chroma first (due to h264's chroma QP offset), then luma, then DC. */ int dc_threshold = (QP_MAX-QP_MAX_SPEC)*2/3; int luma_threshold = (QP_MAX-QP_MAX_SPEC)*2/3; int chroma_threshold = 0; for( int i = 0; i < size; i++ ) { int max = (1 << (7 + BIT_DEPTH)) - 1; /* True "emergency mode": remove all DCT coefficients */ if( q == QP_MAX - QP_MAX_SPEC - 1 ) { nr_offset[i] = max; continue; } int thresh = i == 0 ? dc_threshold : cat >= 2 ? chroma_threshold : luma_threshold; if( q < thresh ) { nr_offset[i] = 0; continue; } double pos = (double)(q-thresh+1) / (QP_MAX - QP_MAX_SPEC - thresh); /* XXX: this math is largely tuned for /dev/random input. */ double start = dct8x8 ? h->unquant8_mf[CQM_8PY][QP_MAX_SPEC][i] : h->unquant4_mf[CQM_4PY][QP_MAX_SPEC][i]; /* Formula chosen as an exponential scale to vaguely mimic the effects * of a higher quantizer. */ double bias = (pow( 2, pos*(QP_MAX - QP_MAX_SPEC)/10. )*0.003-0.003) * start; nr_offset[i] = X264_MIN( bias + 0.5, max ); } } if( !h->mb.b_lossless ) { while( h->chroma_qp_table[SPEC_QP(h->param.rc.i_qp_min)] <= max_chroma_qp_err ) h->param.rc.i_qp_min++; if( min_qp_err <= h->param.rc.i_qp_max ) h->param.rc.i_qp_max = min_qp_err-1; if( max_qp_err >= h->param.rc.i_qp_min ) h->param.rc.i_qp_min = max_qp_err+1; /* If long level-codes aren't allowed, we need to allow QP high enough to avoid them. */ if( !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH ) while( h->chroma_qp_table[SPEC_QP(h->param.rc.i_qp_max)] <= 12 || h->param.rc.i_qp_max <= 12 ) h->param.rc.i_qp_max++; if( h->param.rc.i_qp_min > h->param.rc.i_qp_max ) { x264_log( h, X264_LOG_ERROR, "Impossible QP constraints for CQM (min=%d, max=%d)\n", h->param.rc.i_qp_min, h->param.rc.i_qp_max ); return -1; } } return 0; fail: x264_cqm_delete( h ); return -1; } #define CQM_DELETE( n, max )\ for( int i = 0; i < (max); i++ )\ {\ int j;\ for( j = 0; j < i; j++ )\ if( h->quant##n##_mf[i] == h->quant##n##_mf[j] )\ break;\ if( j == i )\ {\ x264_free( h-> quant##n##_mf[i] );\ x264_free( h->dequant##n##_mf[i] );\ x264_free( h->unquant##n##_mf[i] );\ }\ for( j = 0; j < i; j++ )\ if( h->quant##n##_bias[i] == h->quant##n##_bias[j] )\ break;\ if( j == i )\ {\ x264_free( h->quant##n##_bias[i] );\ x264_free( h->quant##n##_bias0[i] );\ }\ } void x264_cqm_delete( x264_t *h ) { CQM_DELETE( 4, 4 ); CQM_DELETE( 8, CHROMA444 ? 
4 : 2 ); x264_free( h->nr_offset_emergency ); } static int cqm_parse_jmlist( x264_t *h, const char *buf, const char *name, uint8_t *cqm, const uint8_t *jvt, int length ) { int i; char *p = strstr( buf, name ); if( !p ) { memset( cqm, 16, length ); return 0; } p += strlen( name ); if( *p == 'U' || *p == 'V' ) p++; char *nextvar = strstr( p, "INT" ); for( i = 0; i < length && (p = strpbrk( p, " \t\n," )) && (p = strpbrk( p, "0123456789" )); i++ ) { int coef = -1; sscanf( p, "%d", &coef ); if( i == 0 && coef == 0 ) { memcpy( cqm, jvt, length ); return 0; } if( coef < 1 || coef > 255 ) { x264_log( h, X264_LOG_ERROR, "bad coefficient in list '%s'\n", name ); return -1; } cqm[i] = coef; } if( (nextvar && p > nextvar) || i != length ) { x264_log( h, X264_LOG_ERROR, "not enough coefficients in list '%s'\n", name ); return -1; } return 0; } int x264_cqm_parse_file( x264_t *h, const char *filename ) { char *p; int b_error = 0; h->param.i_cqm_preset = X264_CQM_CUSTOM; char *buf = x264_slurp_file( filename ); if( !buf ) { x264_log( h, X264_LOG_ERROR, "can't open file '%s'\n", filename ); return -1; } while( (p = strchr( buf, '#' )) != NULL ) memset( p, ' ', strcspn( p, "\n" ) ); b_error |= cqm_parse_jmlist( h, buf, "INTRA4X4_LUMA", h->param.cqm_4iy, x264_cqm_jvt4i, 16 ); b_error |= cqm_parse_jmlist( h, buf, "INTER4X4_LUMA", h->param.cqm_4py, x264_cqm_jvt4p, 16 ); b_error |= cqm_parse_jmlist( h, buf, "INTRA4X4_CHROMA", h->param.cqm_4ic, x264_cqm_jvt4i, 16 ); b_error |= cqm_parse_jmlist( h, buf, "INTER4X4_CHROMA", h->param.cqm_4pc, x264_cqm_jvt4p, 16 ); b_error |= cqm_parse_jmlist( h, buf, "INTRA8X8_LUMA", h->param.cqm_8iy, x264_cqm_jvt8i, 64 ); b_error |= cqm_parse_jmlist( h, buf, "INTER8X8_LUMA", h->param.cqm_8py, x264_cqm_jvt8p, 64 ); if( CHROMA444 ) { b_error |= cqm_parse_jmlist( h, buf, "INTRA8X8_CHROMA", h->param.cqm_8ic, x264_cqm_jvt8i, 64 ); b_error |= cqm_parse_jmlist( h, buf, "INTER8X8_CHROMA", h->param.cqm_8pc, x264_cqm_jvt8p, 64 ); } x264_free( buf ); return b_error; } x264-master/common/set.h000066400000000000000000000111511502133446700153120ustar00rootroot00000000000000/***************************************************************************** * set.h: quantization init ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_SET_H #define X264_SET_H enum cqm4_e { CQM_4IY = 0, CQM_4PY = 1, CQM_4IC = 2, CQM_4PC = 3 }; enum cqm8_e { CQM_8IY = 0, CQM_8PY = 1, CQM_8IC = 2, CQM_8PC = 3, }; typedef struct { int i_id; int i_profile_idc; int i_level_idc; int b_constraint_set0; int b_constraint_set1; int b_constraint_set2; int b_constraint_set3; int i_log2_max_frame_num; int i_poc_type; /* poc 0 */ int i_log2_max_poc_lsb; int i_num_ref_frames; int b_gaps_in_frame_num_value_allowed; int i_mb_width; int i_mb_height; int b_frame_mbs_only; int b_mb_adaptive_frame_field; int b_direct8x8_inference; int b_crop; struct { int i_left; int i_right; int i_top; int i_bottom; } crop; int b_vui; struct { int b_aspect_ratio_info_present; int i_sar_width; int i_sar_height; int b_overscan_info_present; int b_overscan_info; int b_signal_type_present; int i_vidformat; int b_fullrange; int b_color_description_present; int i_colorprim; int i_transfer; int i_colmatrix; int b_chroma_loc_info_present; int i_chroma_loc_top; int i_chroma_loc_bottom; int b_timing_info_present; uint32_t i_num_units_in_tick; uint32_t i_time_scale; int b_fixed_frame_rate; int b_nal_hrd_parameters_present; int b_vcl_hrd_parameters_present; struct { int i_cpb_cnt; int i_bit_rate_scale; int i_cpb_size_scale; int i_bit_rate_value; int i_cpb_size_value; int i_bit_rate_unscaled; int i_cpb_size_unscaled; int b_cbr_hrd; int i_initial_cpb_removal_delay_length; int i_cpb_removal_delay_length; int i_dpb_output_delay_length; int i_time_offset_length; } hrd; int b_pic_struct_present; int b_bitstream_restriction; int b_motion_vectors_over_pic_boundaries; int i_max_bytes_per_pic_denom; int i_max_bits_per_mb_denom; int i_log2_max_mv_length_horizontal; int i_log2_max_mv_length_vertical; int i_num_reorder_frames; int i_max_dec_frame_buffering; /* FIXME to complete */ } vui; int b_qpprime_y_zero_transform_bypass; int i_chroma_format_idc; int b_avcintra_hd; int b_avcintra_4k; int i_cqm_preset; const uint8_t *scaling_list[8]; /* could be 12, but we don't allow separate Cb/Cr lists */ } x264_sps_t; typedef struct { int i_id; int i_sps_id; int b_cabac; int b_pic_order; int i_num_slice_groups; int i_num_ref_idx_l0_default_active; int i_num_ref_idx_l1_default_active; int b_weighted_pred; int b_weighted_bipred; int i_pic_init_qp; int i_pic_init_qs; int i_chroma_qp_index_offset; int b_deblocking_filter_control; int b_constrained_intra_pred; int b_redundant_pic_cnt; int b_transform_8x8_mode; } x264_pps_t; #define x264_cqm_init x264_template(cqm_init) int x264_cqm_init( x264_t *h ); #define x264_cqm_delete x264_template(cqm_delete) void x264_cqm_delete( x264_t *h ); #define x264_cqm_parse_file x264_template(cqm_parse_file) int x264_cqm_parse_file( x264_t *h, const char *filename ); #endif x264-master/common/tables.c000066400000000000000000003414631502133446700160000ustar00rootroot00000000000000/***************************************************************************** * tables.c: const tables ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "base.h" const x264_level_t x264_levels[] = { { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 }, { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */ { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 }, { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 }, { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 }, { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 }, { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 }, { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 }, { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 }, { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 }, { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 }, { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 }, { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 }, { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, { 60, 4177920, 139264, 696320, 240000, 240000, 8192, 16, 24, 2, 1, 1, 1 }, { 61, 8355840, 139264, 696320, 480000, 480000, 8192, 16, 24, 2, 1, 1, 1 }, { 62, 16711680, 139264, 696320, 800000, 800000, 8192, 16, 24, 2, 1, 1, 1 }, { 0 } }; /***************************************************************************** * MATH *****************************************************************************/ const uint8_t x264_exp2_lut[64] = { 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45, 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102, 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170, 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250 }; const float x264_log2_lut[128] = { 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682, 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987, 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840, 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288, 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370, 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121, 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570, 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743, 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662, 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349, 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819, 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090, 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175, 0.85798, 0.86419, 0.87036, 
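/* The level rows above can be sanity-checked with simple arithmetic: level 4.0
 * allows at most 8192 macroblocks per frame and 245760 macroblocks per second,
 * so 1920x1080 (120x68 = 8160 MBs) fits at 30 fps (244800 MB/s) but not at
 * 60 fps.  A minimal sketch of that check, using only the macroblock-rate and
 * frame-size limits and ignoring DPB/bitrate/CPB/MV-range; fits_level() is a
 * hypothetical helper, not an x264 API:
 *
 *     #include <stdio.h>
 *
 *     static int fits_level( int width, int height, double fps,
 *                            int max_mbps, int max_frame_size )
 *     {
 *         int mbs = ((width+15)/16) * ((height+15)/16);
 *         return mbs <= max_frame_size && mbs * fps <= max_mbps;
 *     }
 *
 *     int main( void )
 *     {
 *         printf( "1080p30 in level 4.0: %d\n", fits_level( 1920, 1080, 30, 245760, 8192 ) );
 *         printf( "1080p60 in level 4.0: %d\n", fits_level( 1920, 1080, 60, 245760, 8192 ) );
 *         return 0;
 *     }
 *
 * The 522240 MB/s of the 4.2 row is the first listed limit that 1080p60 fits
 * under, which is why 1080p60 is normally signalled as level 4.2. */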
0.87652, 0.88264, 0.88874, 0.89482, 0.90087, 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837, 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435, }; /* Avoid an int/float conversion. */ const float x264_log2_lz_lut[32] = { 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; /***************************************************************************** * ANALYSE *****************************************************************************/ /* lambda = pow(2,qp/6-2) */ const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = { 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */ 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */ 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */ 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */ 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */ 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */ 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */ 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */ 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */ 1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */ 2580,2896, /* 80-81 */ }; /* lambda2 = pow(lambda,2) * .9 * 256 */ /* Capped to avoid overflow */ const int x264_lambda2_tab[QP_MAX_MAX+1] = { 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */ 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */ 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */ 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */ 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */ 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */ 943718, 1189010, 1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */ 5992238, 7549747, 9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */ 38048341, 47937906, 60397977, 76096683, 95875813,120795955, /* 64-69 */ 134217727,134217727,134217727,134217727,134217727,134217727, /* 70-75 */ 134217727,134217727,134217727,134217727,134217727,134217727, /* 76-81 */ }; // should the intra and inter lambdas be different? // I'm just matching the behaviour of deadzone quant. const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = { // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS) { 46, 58, 73, 92, 117, 147, 185, 233, 294, 370, 466, 587, 740, 932, 1174, 1480, 1864, 2349, 2959, 3728, 4697, 5918, 7457, 9395, 11837, 14914, 18790, 23674, 29828, 37581, 47349, 59656, 75163, 94699, 119313, 150326, 189399, 238627, 300652, 378798, 477255, 601304, 757596, 954511, 1202608, 1515192, 1909022, 2405217, 3030384, 3818045, 4810435, 6060769, 7636091, 9620872, 12121539, 15272182, 19241743, 24243077, 30544363, 38483486, 48486154, 61088726, 76966972, 96972308, 122177453,134217727,134217727,134217727,134217727,134217727, 134217727,134217727,134217727,134217727,134217727,134217727, }, // intra lambda = .65 * .65 * 2**(qp/3. 
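/* A minimal sketch (not used by the encoder) that regenerates a few entries of
 * x264_lambda_tab / x264_lambda2_tab from the closed forms quoted in the
 * comments above; the real lambda2 table is additionally capped at 134217727
 * to avoid overflow:
 *
 *     #include <math.h>
 *     #include <stdio.h>
 *
 *     int main( void )
 *     {
 *         for( int qp = 24; qp <= 48; qp += 6 )
 *         {
 *             double lambda  = pow( 2, qp/6.0 - 2 );        // lambda = pow(2,qp/6-2)
 *             double lambda2 = lambda * lambda * .9 * 256;  // lambda2 = pow(lambda,2) * .9 * 256
 *             printf( "qp %2d: lambda %4.0f  lambda2 %8.0f\n", qp, lambda, lambda2 );
 *         }
 *         return 0;
 *     }
 *
 * e.g. qp 24 gives lambda 4 and lambda2 3686, matching the table entries; the
 * trellis tables use the same exponential shape with the .85^2 / .65^2 scale
 * factors noted in their comments. */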
+ 10 - LAMBDA_BITS) { 27, 34, 43, 54, 68, 86, 108, 136, 172, 216, 273, 343, 433, 545, 687, 865, 1090, 1374, 1731, 2180, 2747, 3461, 4361, 5494, 6922, 8721, 10988, 13844, 17442, 21976, 27688, 34885, 43953, 55377, 69771, 87906, 110755, 139543, 175813, 221511, 279087, 351627, 443023, 558174, 703255, 886046, 1116348, 1406511, 1772093, 2232697, 2813022, 3544186, 4465396, 5626046, 7088374, 8930791, 11252092, 14176748, 17861583, 22504184, 28353495, 35723165, 45008368, 56706990, 71446330, 90016736,113413980,134217727,134217727,134217727, 134217727,134217727,134217727,134217727,134217727,134217727, 134217727,134217727,134217727,134217727,134217727,134217727, } }; const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1] = { 16, 20, 25, 32, 40, 50, 64, 80, 101, 128, 161, 203, 256, 322, 406, 512, 645, 812, 1024, 1290, 1625, 2048, 2580, 3250, 4096, 5160, 6501, 8192, 10321, 13003, 16384, 20642, 26007, 32768, 41285, 52015, 65535 }; /***************************************************************************** * MC *****************************************************************************/ const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2}; /***************************************************************************** * CQM *****************************************************************************/ /* default quant matrices */ const uint8_t x264_cqm_jvt4i[16] = { 6,13,20,28, 13,20,28,32, 20,28,32,37, 28,32,37,42 }; const uint8_t x264_cqm_jvt4p[16] = { 10,14,20,24, 14,20,24,27, 20,24,27,30, 24,27,30,34 }; const uint8_t x264_cqm_jvt8i[64] = { 6,10,13,16,18,23,25,27, 10,11,16,18,23,25,27,29, 13,16,18,23,25,27,29,31, 16,18,23,25,27,29,31,33, 18,23,25,27,29,31,33,36, 23,25,27,29,31,33,36,38, 25,27,29,31,33,36,38,40, 27,29,31,33,36,38,40,42 }; const uint8_t x264_cqm_jvt8p[64] = { 9,13,15,17,19,21,22,24, 13,13,17,19,21,22,24,25, 15,17,19,21,22,24,25,27, 17,19,21,22,24,25,27,28, 19,21,22,24,25,27,28,30, 21,22,24,25,27,28,30,32, 22,24,25,27,28,30,32,33, 24,25,27,28,30,32,33,35 }; const uint8_t x264_cqm_flat16[64] = { 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16, 16,16,16,16,16,16,16,16 }; const uint8_t * const x264_cqm_jvt[8] = { x264_cqm_jvt4i, x264_cqm_jvt4p, x264_cqm_jvt4i, x264_cqm_jvt4p, x264_cqm_jvt8i, x264_cqm_jvt8p, x264_cqm_jvt8i, x264_cqm_jvt8p }; // 720p_avci50, 1080i_avci50, 1080p_avci50 const uint8_t x264_cqm_avci50_4ic[16] = { 16,22,28,40, 22,28,40,44, 28,40,44,48, 40,44,48,60 }; // 720p_avci50, 1080p_avci50 const uint8_t x264_cqm_avci50_p_8iy[64] = { 16,18,19,21,24,27,30,33, 18,19,21,24,27,30,33,78, 19,21,24,27,30,33,78,81, 21,24,27,30,33,78,81,84, 24,27,30,33,78,81,84,87, 27,30,33,78,81,84,87,90, 30,33,78,81,84,87,90,93, 33,78,81,84,87,90,93,96 }; // 1080i_avci50 const uint8_t x264_cqm_avci50_1080i_8iy[64] = { 16,18,19,21,27,33,81,87, 18,19,21,24,30,33,81,87, 19,21,24,27,30,78,84,90, 21,24,27,30,33,78,84,90, 24,27,30,33,78,81,84,90, 24,27,30,33,78,81,84,93, 27,30,33,78,78,81,87,93, 30,33,33,78,81,84,87,96 }; // 720p_avci100 const uint8_t x264_cqm_avci100_720p_4ic[16] = { 16,21,27,34, 21,27,34,41, 27,34,41,46, 34,41,46,54 }; // 720p_avci100 const uint8_t x264_cqm_avci100_720p_8iy[64] = { 16,18,19,21,22,24,26,32, 18,19,19,21,22,24,26,32, 19,19,21,22,22,24,26,32, 21,21,22,22,23,24,26,34, 22,22,22,23,24,25,26,34, 24,24,24,24,25,26,34,36, 26,26,26,26,26,34,36,38, 
32,32,32,34,34,36,38,42 }; // 1080i_avci100, 1080p_avci100 const uint8_t x264_cqm_avci100_1080_4ic[16] = { 16,20,26,32, 20,26,32,38, 26,32,38,44, 32,38,44,50 }; // 1080i_avci100 const uint8_t x264_cqm_avci100_1080i_8iy[64] = { 16,19,20,23,24,26,32,42, 18,19,22,24,26,32,36,42, 18,20,23,24,26,32,36,63, 19,20,23,26,32,36,42,63, 20,22,24,26,32,36,59,63, 22,23,24,26,32,36,59,68, 22,23,24,26,32,42,59,68, 22,23,24,26,36,42,59,72 }; // 1080p_avci100 const uint8_t x264_cqm_avci100_1080p_8iy[64] = { 16,18,19,20,22,23,24,26, 18,19,20,22,23,24,26,32, 19,20,22,23,24,26,32,36, 20,22,23,24,26,32,36,42, 22,23,24,26,32,36,42,59, 23,24,26,32,36,42,59,63, 24,26,32,36,42,59,63,68, 26,32,36,42,59,63,68,72 }; // 2160p_avci300 const uint8_t x264_cqm_avci300_2160p_4iy[16] = { 12,16,19,20, 16,19,20,24, 19,20,24,33, 20,24,33,39 }; // 2160p_avci300 const uint8_t x264_cqm_avci300_2160p_4ic[16] = { 28,39,56,67, 39,56,67,77, 56,67,77,104, 67,77,104,133 }; // 2160p_avci300 const uint8_t x264_cqm_avci300_2160p_8iy[64] = { 12,14,16,17,19,20,20,24, 14,16,17,19,20,20,24,30, 16,17,19,20,20,24,30,42, 17,19,20,20,24,30,42,56, 19,20,20,24,30,42,56,72, 20,20,24,30,42,56,72,76, 20,24,30,42,56,72,76,80, 24,30,42,56,72,76,80,84 }; /***************************************************************************** * QUANT *****************************************************************************/ const uint8_t x264_decimate_table4[16] = { 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 }; const uint8_t x264_decimate_table8[64] = { 3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; /***************************************************************************** * DCT *****************************************************************************/ /* the inverse of the scaling factors introduced by 8x8 fdct */ /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */ #define W(i) (i==0 ? FIX8(1.0000) :\ i==1 ? FIX8(0.8859) :\ i==2 ? FIX8(1.6000) :\ i==3 ? FIX8(0.9415) :\ i==4 ? FIX8(1.2651) :\ i==5 ? FIX8(1.1910) :0) const uint32_t x264_dct8_weight_tab[64] = { W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1) }; #undef W #define W(i) (i==0 ? FIX8(1.76777) :\ i==1 ? FIX8(1.11803) :\ i==2 ? FIX8(0.70711) :0) const uint32_t x264_dct4_weight_tab[16] = { W(0), W(1), W(0), W(1), W(1), W(2), W(1), W(2), W(0), W(1), W(0), W(1), W(1), W(2), W(1), W(2) }; #undef W /* inverse squared */ #define W(i) (i==0 ? FIX8(3.125) :\ i==1 ? FIX8(1.25) :\ i==2 ? FIX8(0.5) :0) const uint32_t x264_dct4_weight2_tab[16] = { W(0), W(1), W(0), W(1), W(1), W(2), W(1), W(2), W(0), W(1), W(0), W(1), W(1), W(2), W(1), W(2) }; #undef W #define W(i) (i==0 ? FIX8(1.00000) :\ i==1 ? FIX8(0.78487) :\ i==2 ? FIX8(2.56132) :\ i==3 ? FIX8(0.88637) :\ i==4 ? FIX8(1.60040) :\ i==5 ? 
FIX8(1.41850) :0) const uint32_t x264_dct8_weight2_tab[64] = { W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1), W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5), W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1) }; #undef W /***************************************************************************** * CABAC *****************************************************************************/ const int8_t x264_cabac_context_init_I[1024][2] = { /* 0 - 10 */ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, { 2, 54 }, { 3, 74 }, { -28,127 }, { -23, 104 }, { -6, 53 }, { -1, 54 }, { 7, 51 }, /* 11 - 23 unused for I */ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, /* 24- 39 */ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, /* 40 - 53 */ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, /* 54 - 59 */ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, /* 60 - 69 */ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, { 13, 41 }, { 3, 62 }, /* 70 -> 87 */ { 0, 11 }, { 1, 55 }, { 0, 69 }, { -17, 127 }, { -13, 102 },{ 0, 82 }, { -7, 74 }, { -21, 107 }, { -27, 127 },{ -31, 127 },{ -24, 127 }, { -18, 95 }, { -27, 127 },{ -21, 114 },{ -30, 127 }, { -17, 123 }, { -12, 115 },{ -16, 122 }, /* 88 -> 104 */ { -11, 115 },{ -12, 63 }, { -2, 68 }, { -15, 84 }, { -13, 104 },{ -3, 70 }, { -8, 93 }, { -10, 90 }, { -30, 127 },{ -1, 74 }, { -6, 97 }, { -7, 91 }, { -20, 127 },{ -4, 56 }, { -5, 82 }, { -7, 76 }, { -22, 125 }, /* 105 -> 135 */ { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, { 14, 62 }, { -13, 108 },{ -15, 100 }, /* 136 -> 165 */ { -13, 101 },{ -13, 91 }, { -12, 94 }, { -10, 88 }, { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, { 18, 59 }, { -8, 102 }, { -15, 100 }, { 0, 95 }, { -4, 75 }, { 2, 72 }, { -11, 75 }, { -3, 71 }, { 15, 46 }, { -13, 69 }, { 0, 62 }, { 0, 65 }, { 21, 37 }, { -15, 72 }, { 9, 57 }, { 16, 54 }, { 0, 62 }, { 12, 72 }, /* 166 -> 196 */ { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, { 0, 89 }, { 26, -19 }, { 22, -17 }, /* 197 -> 226 */ { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, { 41, 17 }, { 30, -6 }, { 27, 3 }, { 26, 22 }, { 37, -16 }, { 35, -4 }, { 38, -8 }, { 38, -3 }, { 37, 3 }, { 38, 5 }, { 42, 0 }, { 35, 16 }, { 39, 22 }, { 14, 48 }, { 27, 37 }, { 21, 60 }, { 12, 68 }, { 2, 97 }, /* 227 -> 251 */ { -3, 71 }, { -6, 42 }, { 
-5, 50 }, { -3, 54 }, { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, { -4, 65 }, /* 252 -> 275 */ { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, { -17, 110 },{ -11, 97 }, { -20, 84 }, { -11, 79 }, { -6, 73 }, { -4, 74 }, { -13, 86 }, { -13, 96 }, { -11, 97 }, { -19, 117 },{ -8, 78 }, { -5, 33 }, { -4, 48 }, { -2, 53 }, { -3, 62 }, { -13, 71 }, { -10, 79 }, { -12, 86 }, { -13, 90 }, { -14, 97 }, /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */ { 0, 0 }, /* 277 -> 307 */ { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, { 9, 64 }, { -12, 104 },{ -11, 97 }, /* 308 -> 337 */ { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, { -1, 83 }, { -7, 99 }, { -14, 95 }, { 2, 95 }, { 0, 76 }, { -5, 74 }, { 0, 70 }, { -11, 75 }, { 1, 68 }, { 0, 65 }, { -14, 73 }, { 3, 62 }, { 4, 62 }, { -1, 68 }, { -13, 75 }, { 11, 55 }, { 5, 64 }, { 12, 70 }, /* 338 -> 368 */ { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, { -12, 109 },{ 36, -35 }, { 36, -34 }, /* 369 -> 398 */ { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, { 13, 58 }, { 29, -3 }, { 26, 0 }, { 22, 30 }, { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, { 29, 39 }, { 19, 66 }, /* 399 -> 435 */ { 31, 21 }, { 31, 31 }, { 25, 50 }, { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, { 0, 68 }, { -9, 92 }, /* 436 -> 459 */ { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 }, /* 460 -> 1024 */ { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 }, { -12, 63 }, { -2, 68 }, { -15, 84 }, { -13, 104 }, { -3, 70 }, { -8, 93 }, { -10, 90 }, { -30, 127 }, { -17, 123 }, { -12, 115 }, { -16, 122 }, { -11, 115 }, { -12, 63 }, { -2, 68 }, { -15, 84 }, { -13, 104 }, { -3, 70 }, { -8, 93 }, { -10, 90 }, { -30, 127 }, { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, { 7, 52 }, { 10, 35 }, { 0, 44 }, { 
11, 38 }, { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, { 14, 62 }, { -13, 108 }, { -15, 100 }, { -13, 101 }, { -13, 91 }, { -12, 94 }, { -10, 88 }, { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, { 18, 59 }, { -7, 93 }, { -11, 87 }, { -3, 77 }, { -5, 71 }, { -4, 63 }, { -4, 68 }, { -12, 84 }, { -7, 62 }, { -7, 65 }, { 8, 61 }, { 5, 56 }, { -2, 66 }, { 1, 64 }, { 0, 61 }, { -2, 78 }, { 1, 50 }, { 7, 52 }, { 10, 35 }, { 0, 44 }, { 11, 38 }, { 1, 45 }, { 0, 46 }, { 5, 44 }, { 31, 17 }, { 1, 51 }, { 7, 50 }, { 28, 19 }, { 16, 33 }, { 14, 62 }, { -13, 108 }, { -15, 100 }, { -13, 101 }, { -13, 91 }, { -12, 94 }, { -10, 88 }, { -16, 84 }, { -10, 86 }, { -7, 83 }, { -13, 87 }, { -19, 94 }, { 1, 70 }, { 0, 72 }, { -5, 74 }, { 18, 59 }, { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, { 0, 89 }, { 26, -19 }, { 22, -17 }, { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, { 41, 17 }, { 24, 0 }, { 15, 9 }, { 8, 25 }, { 13, 18 }, { 15, 9 }, { 13, 19 }, { 10, 37 }, { 12, 18 }, { 6, 29 }, { 20, 33 }, { 15, 30 }, { 4, 45 }, { 1, 58 }, { 0, 62 }, { 7, 61 }, { 12, 38 }, { 11, 45 }, { 15, 39 }, { 11, 42 }, { 13, 44 }, { 16, 45 }, { 12, 41 }, { 10, 49 }, { 30, 34 }, { 18, 42 }, { 10, 55 }, { 17, 51 }, { 17, 46 }, { 0, 89 }, { 26, -19 }, { 22, -17 }, { 26, -17 }, { 30, -25 }, { 28, -20 }, { 33, -23 }, { 37, -27 }, { 33, -23 }, { 40, -28 }, { 38, -17 }, { 33, -11 }, { 40, -15 }, { 41, -6 }, { 38, 1 }, { 41, 17 }, { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, { -23, 68 }, { -24, 50 }, { -11, 74 }, { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, { -9, 64 }, { -5, 58 }, { 2, 59 }, { 23, -13 }, { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, { 21, -10 }, { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 }, { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, { 0, 68 }, { -9, 92 }, { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, { -23, 68 }, { -24, 50 }, { -11, 74 }, { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, { -9, 64 }, { -5, 58 }, { 2, 59 }, { 23, -13 }, { 26, -13 }, { 40, -15 }, { 49, -14 }, { 44, 3 }, { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, { 21, -10 }, { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 }, { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, { 0, 68 }, { -9, 92 }, { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, { -7, 62 }, { 15, 36 }, { 14, 40 }, { 
16, 27 }, { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, { 9, 64 }, { -12, 104 }, { -11, 97 }, { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, { -1, 83 }, { -6, 93 }, { -6, 84 }, { -8, 79 }, { 0, 66 }, { -1, 71 }, { 0, 62 }, { -2, 60 }, { -2, 59 }, { -5, 75 }, { -3, 62 }, { -4, 58 }, { -9, 66 }, { -1, 79 }, { 0, 71 }, { 3, 68 }, { 10, 44 }, { -7, 62 }, { 15, 36 }, { 14, 40 }, { 16, 27 }, { 12, 29 }, { 1, 44 }, { 20, 36 }, { 18, 32 }, { 5, 42 }, { 1, 48 }, { 10, 62 }, { 17, 46 }, { 9, 64 }, { -12, 104 }, { -11, 97 }, { -16, 96 }, { -7, 88 }, { -8, 85 }, { -7, 85 }, { -9, 85 }, { -13, 88 }, { 4, 66 }, { -3, 77 }, { -3, 76 }, { -6, 76 }, { 10, 58 }, { -1, 76 }, { -1, 83 }, { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, { -12, 109 }, { 36, -35 }, { 36, -34 }, { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, { 13, 58 }, { 15, 6 }, { 6, 19 }, { 7, 16 }, { 12, 14 }, { 18, 13 }, { 13, 11 }, { 13, 15 }, { 15, 16 }, { 12, 23 }, { 13, 23 }, { 15, 20 }, { 14, 26 }, { 14, 44 }, { 17, 40 }, { 17, 47 }, { 24, 17 }, { 21, 21 }, { 25, 22 }, { 31, 27 }, { 22, 29 }, { 19, 35 }, { 14, 50 }, { 10, 57 }, { 7, 63 }, { -2, 77 }, { -4, 82 }, { -3, 94 }, { 9, 69 }, { -12, 109 }, { 36, -35 }, { 36, -34 }, { 32, -26 }, { 37, -30 }, { 44, -32 }, { 34, -18 }, { 34, -15 }, { 40, -15 }, { 33, -7 }, { 35, -5 }, { 33, 0 }, { 38, 2 }, { 33, 13 }, { 23, 35 }, { 13, 58 }, { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, { -4, 65 }, { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, { -17, 110 }, { -3, 71 }, { -6, 42 }, { -5, 50 }, { -3, 54 }, { -2, 62 }, { 0, 58 }, { 1, 63 }, { -2, 72 }, { -1, 74 }, { -9, 91 }, { -5, 67 }, { -5, 27 }, { -3, 39 }, { -2, 44 }, { 0, 46 }, { -16, 64 }, { -8, 68 }, { -10, 78 }, { -6, 77 }, { -10, 86 }, { -12, 92 }, { -15, 55 }, { -10, 60 }, { -6, 62 }, { -4, 65 }, { -12, 73 }, { -8, 76 }, { -7, 80 }, { -9, 88 }, { -17, 110 }, { -3, 70 }, { -8, 93 }, { -10, 90 }, { -30, 127 }, { -3, 70 }, { -8, 93 }, { -10, 90 }, { -30, 127 }, { -3, 70 }, { -8, 93 }, { -10, 90 }, { -30, 127 } }; const int8_t x264_cabac_context_init_PB[3][1024][2] = { /* i_cabac_init_idc == 0 */ { /* 0 - 10 */ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, { -6, 53 }, { -1, 54 }, { 7, 51 }, /* 11 - 23 */ { 23, 33 }, { 23, 2 }, { 21, 0 }, { 1, 9 }, { 0, 49 }, { -37, 118 }, { 5, 57 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, { 12, 49 }, { -4, 73 }, { 17, 50 }, /* 24 - 39 */ { 18, 64 }, { 9, 43 }, { 29, 0 }, { 26, 67 }, { 16, 90 }, { 9, 104 }, { -46, 127 }, { -20, 104 }, { 1, 67 }, { -13, 78 }, { -11, 65 }, { 1, 62 }, { -6, 86 }, { -17, 95 }, { -6, 61 }, { 9, 45 }, /* 40 - 53 */ { -3, 69 }, { -6, 81 }, { -11, 96 }, { 6, 55 }, { 7, 67 }, { -5, 86 }, { 2, 88 }, { 0, 58 }, { -3, 76 }, { -10, 94 }, { 5, 54 }, { 4, 69 }, { -3, 81 }, { 0, 88 }, 
/* 54 - 59 */ { -7, 67 }, { -5, 74 }, { -4, 74 }, { -5, 80 }, { -7, 72 }, { 1, 58 }, /* 60 - 69 */ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, { 13, 41 }, { 3, 62 }, /* 70 - 87 */ { 0, 45 }, { -4, 78 }, { -3, 96 }, { -27, 126 }, { -28, 98 }, { -25, 101 }, { -23, 67 }, { -28, 82 }, { -20, 94 }, { -16, 83 }, { -22, 110 }, { -21, 91 }, { -18, 102 }, { -13, 93 }, { -29, 127 }, { -7, 92 }, { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, { -9, 92 }, { -8, 87 }, { -23, 126 }, { 5, 54 }, { 6, 60 }, { 6, 59 }, { 6, 69 }, { -1, 48 }, { 0, 68 }, { -4, 69 }, { -8, 88 }, /* 105 -> 165 */ { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, { 3, 64 }, { 1, 61 }, { 9, 63 }, { 7, 50 }, { 16, 39 }, { 5, 44 }, { 4, 52 }, { 11, 48 }, { -5, 60 }, { -1, 59 }, { 0, 59 }, { 22, 33 }, { 5, 44 }, { 14, 43 }, { -1, 78 }, { 0, 60 }, { 9, 69 }, /* 166 - 226 */ { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, { 1, 67 }, { 5, 59 }, { 9, 67 }, { 16, 30 }, { 18, 32 }, { 18, 35 }, { 22, 29 }, { 24, 31 }, { 23, 38 }, { 18, 43 }, { 20, 41 }, { 11, 63 }, { 9, 59 }, { 9, 64 }, { -1, 94 }, { -2, 89 }, { -9, 108 }, /* 227 - 275 */ { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, { -3, 74 }, { -10, 90 }, { 0, 70 }, { -4, 29 }, { 5, 31 }, { 7, 42 }, { 1, 59 }, { -2, 58 }, { -3, 72 }, { -3, 81 }, { -11, 97 }, { 0, 58 }, { 8, 5 }, { 10, 14 }, { 14, 18 }, { 13, 27 }, { 2, 40 }, { 0, 58 }, { -3, 70 }, { -6, 79 }, { -8, 85 }, /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */ { 0, 0 }, /* 277 - 337 */ { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, { -2, 69 }, { -2, 59 }, { 6, 70 }, { 10, 44 }, { 9, 31 }, { 12, 43 }, { 3, 53 }, { 14, 34 }, { 10, 38 }, { -3, 52 }, { 13, 40 }, { 17, 32 }, { 7, 44 }, { 7, 38 }, { 13, 50 }, { 10, 57 }, { 26, 43 }, /* 338 - 398 */ { 
14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, { 8, 60 }, { 6, 63 }, { 17, 65 }, { 21, 24 }, { 23, 20 }, { 26, 23 }, { 27, 32 }, { 28, 23 }, { 28, 24 }, { 23, 40 }, { 24, 32 }, { 28, 29 }, { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, { 11, 86 }, /* 399 -> 435 */ { 12, 40 }, { 11, 51 }, { 14, 59 }, { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, { -8, 66 }, { -8, 76 }, /* 436 -> 459 */ { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, /* 460 - 1024 */ { -7, 92 }, { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, { -9, 92 }, { -8, 87 }, { -23, 126 }, { -7, 92 }, { -5, 89 }, { -7, 96 }, { -13, 108 }, { -3, 46 }, { -1, 65 }, { -1, 57 }, { -9, 93 }, { -3, 74 }, { -9, 92 }, { -8, 87 }, { -23, 126 }, { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, { -2, 85 }, { -6, 78 }, { -1, 75 }, { -7, 77 }, { 2, 54 }, { 5, 50 }, { -3, 68 }, { 1, 50 }, { 6, 42 }, { -4, 81 }, { 1, 63 }, { -4, 70 }, { 0, 67 }, { 2, 57 }, { -2, 76 }, { 11, 35 }, { 4, 64 }, { 1, 61 }, { 11, 35 }, { 18, 25 }, { 12, 24 }, { 13, 29 }, { 13, 36 }, { -10, 93 }, { -7, 73 }, { -2, 73 }, { 13, 46 }, { 9, 49 }, { -7, 100 }, { 9, 53 }, { 2, 53 }, { 5, 53 }, { -2, 61 }, { 0, 56 }, { 0, 56 }, { -13, 63 }, { -5, 60 }, { -1, 62 }, { 4, 57 }, { -6, 69 }, { 4, 57 }, { 14, 39 }, { 4, 51 }, { 13, 68 }, { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, { 6, 57 }, { 7, 53 }, { 6, 52 }, { 6, 55 }, { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, { 11, 28 }, { 2, 40 }, { 3, 44 }, { 0, 49 }, { 0, 46 }, { 2, 44 }, { 2, 51 }, { 0, 47 }, { 4, 39 }, { 2, 62 }, { 6, 46 }, { 0, 54 }, { 3, 54 }, { 2, 58 }, { 4, 63 }, { 6, 51 }, { 6, 57 }, { 7, 53 }, { 
6, 52 }, { 6, 55 }, { 11, 45 }, { 14, 36 }, { 8, 53 }, { -1, 82 }, { 7, 55 }, { -3, 78 }, { 15, 46 }, { 22, 31 }, { -1, 84 }, { 25, 7 }, { 30, -7 }, { 28, 3 }, { 28, 4 }, { 32, 0 }, { 34, -1 }, { 30, 6 }, { 30, 6 }, { 32, 9 }, { 31, 19 }, { 26, 27 }, { 26, 30 }, { 37, 20 }, { 28, 34 }, { 17, 70 }, { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, { -16, 66 }, { -22, 65 }, { -20, 63 }, { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, { -14, 66 }, { 0, 59 }, { 2, 59 }, { 9, -2 }, { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, { 21, -13 }, { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, { -8, 66 }, { -8, 76 }, { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, { -16, 66 }, { -22, 65 }, { -20, 63 }, { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, { -14, 66 }, { 0, 59 }, { 2, 59 }, { 9, -2 }, { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, { 21, -13 }, { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, { -8, 66 }, { -8, 76 }, { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, { -13, 106 }, { -16, 106 }, { -10, 87 }, { -21, 114 }, { -18, 110 }, { -14, 98 }, { -22, 110 }, { -21, 106 }, { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 }, { -10, 96 }, { -12, 95 }, { -5, 91 }, { -9, 93 }, { -22, 94 }, { -5, 86 }, { 9, 67 }, { -4, 80 }, { -10, 85 }, { -1, 70 }, { 7, 60 }, { 9, 58 }, { 5, 61 }, { 12, 50 }, { 15, 50 }, { 18, 49 }, { 17, 54 }, { 10, 41 }, { 7, 46 }, { -1, 51 }, { 7, 49 }, { 8, 52 }, { 9, 41 }, { 6, 47 }, { 2, 55 }, { 13, 41 }, { 10, 44 }, { 6, 50 }, { 5, 53 }, { 13, 49 }, { 4, 63 }, { 6, 64 }, { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, { 53, -45 }, { 48, -26 }, { 65, -43 }, { 43, -19 }, { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, { 14, 11 }, { 11, 14 }, { 9, 11 }, { 18, 11 }, { 21, 9 }, { 23, -2 }, { 32, -15 }, { 32, -15 }, { 34, -21 }, { 39, -23 }, { 42, -33 }, { 41, -31 }, { 46, -28 }, { 38, -12 }, { 21, 29 }, { 45, -24 }, { 53, -45 }, { 48, -26 
}, { 65, -43 }, { 43, -19 }, { 39, -10 }, { 30, 9 }, { 18, 26 }, { 20, 27 }, { 0, 57 }, { -14, 82 }, { -5, 75 }, { -19, 97 }, { -35, 125 }, { 27, 0 }, { 28, 0 }, { 31, -4 }, { 27, 6 }, { 34, 8 }, { 30, 10 }, { 24, 22 }, { 33, 19 }, { 22, 32 }, { 26, 31 }, { 21, 41 }, { 26, 44 }, { 23, 47 }, { 16, 65 }, { 14, 71 }, { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, { -3, 74 }, { -10, 90 }, { -6, 76 }, { -2, 44 }, { 0, 45 }, { 0, 52 }, { -3, 64 }, { -2, 59 }, { -4, 70 }, { -4, 75 }, { -8, 82 }, { -17, 102 }, { -9, 77 }, { 3, 24 }, { 0, 42 }, { 0, 48 }, { 0, 55 }, { -6, 59 }, { -7, 71 }, { -12, 83 }, { -11, 87 }, { -30, 119 }, { 1, 58 }, { -3, 29 }, { -1, 36 }, { 1, 38 }, { 2, 43 }, { -6, 55 }, { 0, 58 }, { 0, 64 }, { -3, 74 }, { -10, 90 }, { -3, 74 }, { -9, 92 }, { -8, 87 }, { -23, 126 }, { -3, 74 }, { -9, 92 }, { -8, 87 }, { -23, 126 }, { -3, 74 }, { -9, 92 }, { -8, 87 }, { -23, 126 } }, /* i_cabac_init_idc == 1 */ { /* 0 - 10 */ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, { 2, 54 }, { 3, 74 }, { -28, 127 }, { -23, 104 }, { -6, 53 }, { -1, 54 }, { 7, 51 }, /* 11 - 23 */ { 22, 25 }, { 34, 0 }, { 16, 0 }, { -2, 9 }, { 4, 41 }, { -29, 118 }, { 2, 65 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, { 9, 50 }, { -3, 70 }, { 10, 54 }, /* 24 - 39 */ { 26, 34 }, { 19, 22 }, { 40, 0 }, { 57, 2 }, { 41, 36 }, { 26, 69 }, { -45, 127 }, { -15, 101 }, { -4, 76 }, { -6, 71 }, { -13, 79 }, { 5, 52 }, { 6, 69 }, { -13, 90 }, { 0, 52 }, { 8, 43 }, /* 40 - 53 */ { -2, 69 },{ -5, 82 },{ -10, 96 },{ 2, 59 }, { 2, 75 },{ -3, 87 },{ -3, 100 },{ 1, 56 }, { -3, 74 },{ -6, 85 },{ 0, 59 },{ -3, 81 }, { -7, 86 },{ -5, 95 }, /* 54 - 59 */ { -1, 66 },{ -1, 77 },{ 1, 70 },{ -2, 86 }, { -5, 72 },{ 0, 61 }, /* 60 - 69 */ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, { 13, 41 }, { 3, 62 }, /* 70 - 104 */ { 13, 15 }, { 7, 51 }, { 2, 80 }, { -39, 127 }, { -18, 91 }, { -17, 96 }, { -26, 81 }, { -35, 98 }, { -24, 102 }, { -23, 97 }, { -27, 119 }, { -24, 99 }, { -21, 110 }, { -18, 102 }, { -36, 127 }, { 0, 80 }, { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, { -12, 104 }, { -9, 91 }, { -31, 127 }, { 3, 55 }, { 7, 56 }, { 7, 55 }, { 8, 61 }, { -3, 53 }, { 0, 68 }, { -7, 74 }, { -9, 88 }, /* 105 -> 165 */ { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, { -4, 71 }, { 0, 58 }, { 7, 61 }, { 9, 41 }, { 18, 25 }, { 9, 32 }, { 5, 43 }, { 9, 47 }, { 0, 44 }, { 0, 51 }, { 2, 46 }, { 19, 38 }, { -4, 66 }, { 15, 38 }, { 12, 42 }, { 9, 34 }, { 0, 89 }, /* 166 - 226 */ { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, { 29, 10 }, { 37, -5 }, { 
51, -29 }, { 39, -9 }, { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, { 0, 75 }, { 2, 72 }, { 8, 77 }, { 14, 35 }, { 18, 31 }, { 17, 35 }, { 21, 30 }, { 17, 45 }, { 20, 42 }, { 18, 45 }, { 27, 26 }, { 16, 54 }, { 7, 66 }, { 16, 56 }, { 11, 73 }, { 10, 67 }, { -10, 116 }, /* 227 - 275 */ { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, { -5, 74 }, { -9, 86 }, { 2, 66 }, { -9, 34 }, { 1, 32 }, { 11, 31 }, { 5, 52 }, { -2, 55 }, { -2, 67 }, { 0, 73 }, { -8, 89 }, { 3, 52 }, { 7, 4 }, { 10, 8 }, { 17, 8 }, { 16, 19 }, { 3, 37 }, { -1, 61 }, { -5, 73 }, { -1, 70 }, { -4, 78 }, /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */ { 0, 0 }, /* 277 - 337 */ { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, { -1, 70 }, { -9, 72 }, { 14, 60 }, { 16, 37 }, { 0, 47 }, { 18, 35 }, { 11, 37 }, { 12, 41 }, { 10, 41 }, { 2, 48 }, { 12, 41 }, { 13, 41 }, { 0, 59 }, { 3, 50 }, { 19, 40 }, { 3, 66 }, { 18, 50 }, /* 338 - 398 */ { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, { 12, 48 }, { 11, 49 }, { 26, 45 }, { 22, 22 }, { 23, 22 }, { 27, 21 }, { 33, 20 }, { 26, 28 }, { 30, 24 }, { 27, 34 }, { 18, 42 }, { 25, 39 }, { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, { 11, 83 }, /* 399 -> 435 */ { 25, 32 }, { 21, 49 }, { 21, 54 }, { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, { -4, 67 }, { -7, 82 }, /* 436 -> 459 */ { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, /* 460 - 1024 */ { 0, 80 
}, { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, { -12, 104 }, { -9, 91 }, { -31, 127 }, { 0, 80 }, { -5, 89 }, { -7, 94 }, { -4, 92 }, { 0, 39 }, { 0, 65 }, { -15, 84 }, { -35, 127 }, { -2, 73 }, { -12, 104 }, { -9, 91 }, { -31, 127 }, { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, { -13, 103 }, { -13, 91 }, { -9, 89 }, { -14, 92 }, { -8, 76 }, { -12, 87 }, { -23, 110 }, { -24, 105 }, { -10, 78 }, { -20, 112 }, { -17, 99 }, { -78, 127 }, { -70, 127 }, { -50, 127 }, { -46, 127 }, { -4, 66 }, { -5, 78 }, { -4, 71 }, { -8, 72 }, { 2, 59 }, { -1, 55 }, { -7, 70 }, { -6, 75 }, { -8, 89 }, { -34, 119 }, { -3, 75 }, { 32, 20 }, { 30, 22 }, { -44, 127 }, { 0, 54 }, { -5, 61 }, { 0, 58 }, { -1, 60 }, { -3, 61 }, { -8, 67 }, { -25, 84 }, { -14, 74 }, { -5, 65 }, { 5, 52 }, { 2, 57 }, { 0, 61 }, { -9, 69 }, { -11, 70 }, { 18, 55 }, { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, { 4, 45 }, { 10, 28 }, { 10, 31 }, { 33, -11 }, { 52, -43 }, { 18, 15 }, { 28, 0 }, { 35, -22 }, { 38, -25 }, { 34, 0 }, { 39, -18 }, { 32, -12 }, { 102, -94 }, { 0, 0 }, { 56, -15 }, { 33, -4 }, { 29, 10 }, { 37, -5 }, { 51, -29 }, { 39, -9 }, { 52, -34 }, { 69, -58 }, { 67, -63 }, { 44, -5 }, { 32, 7 }, { 55, -29 }, { 32, 1 }, { 0, 0 }, { 27, 36 }, { 33, -25 }, { 34, -30 }, { 36, -28 }, { 38, -28 }, { 38, -27 }, { 34, -18 }, { 35, -16 }, { 34, -14 }, { 32, -8 }, { 37, -6 }, { 35, 0 }, { 30, 10 }, { 28, 18 }, { 26, 25 }, { 29, 41 }, { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, { -14, 66 }, { 0, 59 }, { 2, 59 }, { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, { 17, -10 }, { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, { -4, 67 }, { -7, 82 }, { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, { -14, 66 }, { 0, 59 }, { 2, 59 }, { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, { 32, -13 }, { 
42, -9 }, { 49, -5 }, { 53, 0 }, { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, { 17, -10 }, { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, { -2, 52 }, { -9, 57 }, { -6, 63 }, { -4, 65 }, { -4, 67 }, { -7, 82 }, { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 }, { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 }, { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 }, { -10, 95 }, { -14, 100 }, { -8, 95 }, { -17, 111 }, { -28, 114 }, { -6, 89 }, { -2, 80 }, { -4, 82 }, { -9, 85 }, { -8, 81 }, { -1, 72 }, { 5, 64 }, { 1, 67 }, { 9, 56 }, { 0, 69 }, { 1, 69 }, { 7, 69 }, { -7, 69 }, { -6, 67 }, { -16, 77 }, { -2, 64 }, { 2, 61 }, { -6, 67 }, { -3, 64 }, { 2, 57 }, { -3, 65 }, { -3, 66 }, { 0, 62 }, { 9, 51 }, { -1, 66 }, { -2, 71 }, { -2, 75 }, { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, { 19, -6 }, { 18, -6 }, { 14, 0 }, { 26, -12 }, { 31, -16 }, { 33, -25 }, { 33, -22 }, { 37, -28 }, { 39, -30 }, { 42, -30 }, { 47, -42 }, { 45, -36 }, { 49, -34 }, { 41, -17 }, { 32, 9 }, { 69, -71 }, { 63, -63 }, { 66, -64 }, { 77, -74 }, { 54, -39 }, { 52, -35 }, { 41, -10 }, { 36, 0 }, { 40, -1 }, { 30, 14 }, { 28, 26 }, { 23, 37 }, { 12, 55 }, { 11, 65 }, { 37, -33 }, { 39, -36 }, { 40, -37 }, { 38, -30 }, { 46, -33 }, { 42, -30 }, { 40, -24 }, { 49, -29 }, { 38, -12 }, { 40, -10 }, { 38, -3 }, { 46, -5 }, { 31, 20 }, { 29, 30 }, { 25, 44 }, { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, { -5, 74 }, { -9, 86 }, { -23, 112 }, { -15, 71 }, { -7, 61 }, { 0, 53 }, { -5, 66 }, { -11, 77 }, { -9, 80 }, { -9, 84 }, { -10, 87 }, { -34, 127 }, { -21, 101 }, { -3, 39 }, { -5, 53 }, { -7, 61 }, { -11, 75 }, { -15, 77 }, { -17, 91 }, { -25, 107 }, { -25, 111 }, { -28, 122 }, { -11, 76 }, { -10, 44 }, { -10, 52 }, { -10, 57 }, { -9, 58 }, { -16, 72 }, { -7, 69 }, { -4, 69 }, { -5, 74 }, { -9, 86 }, { -2, 73 }, { -12, 104 }, { -9, 91 }, { -31, 127 }, { -2, 73 }, { -12, 104 }, { -9, 91 }, { -31, 127 }, { -2, 73 }, { -12, 104 }, { -9, 91 }, { -31, 127 } }, /* i_cabac_init_idc == 2 */ { /* 0 - 10 */ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, { 2, 54 }, { 3, 74 }, { -28, 
127 }, { -23, 104 }, { -6, 53 }, { -1, 54 }, { 7, 51 }, /* 11 - 23 */ { 29, 16 }, { 25, 0 }, { 14, 0 }, { -10, 51 }, { -3, 62 }, { -27, 99 }, { 26, 16 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, { 6, 57 }, { -17, 73 }, { 14, 57 }, /* 24 - 39 */ { 20, 40 }, { 20, 10 }, { 29, 0 }, { 54, 0 }, { 37, 42 }, { 12, 97 }, { -32, 127 }, { -22, 117 }, { -2, 74 }, { -4, 85 }, { -24, 102 }, { 5, 57 }, { -6, 93 }, { -14, 88 }, { -6, 44 }, { 4, 55 }, /* 40 - 53 */ { -11, 89 },{ -15, 103 },{ -21, 116 },{ 19, 57 }, { 20, 58 },{ 4, 84 },{ 6, 96 },{ 1, 63 }, { -5, 85 },{ -13, 106 },{ 5, 63 },{ 6, 75 }, { -3, 90 },{ -1, 101 }, /* 54 - 59 */ { 3, 55 },{ -4, 79 },{ -2, 75 },{ -12, 97 }, { -7, 50 },{ 1, 60 }, /* 60 - 69 */ { 0, 41 }, { 0, 63 }, { 0, 63 }, { 0, 63 }, { -9, 83 }, { 4, 86 }, { 0, 97 }, { -7, 72 }, { 13, 41 }, { 3, 62 }, /* 70 - 104 */ { 7, 34 }, { -9, 88 }, { -20, 127 }, { -36, 127 }, { -17, 91 }, { -14, 95 }, { -25, 84 }, { -25, 86 }, { -12, 89 }, { -17, 91 }, { -31, 127 }, { -14, 76 }, { -18, 103 }, { -13, 90 }, { -37, 127 }, { 11, 80 }, { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, { -11, 104 }, { -11, 91 }, { -30, 127 }, { 0, 65 }, { -2, 79 }, { 0, 72 }, { -4, 92 }, { -6, 56 }, { 3, 68 }, { -8, 71 }, { -13, 98 }, /* 105 -> 165 */ { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, { 3, 65 }, { -7, 69 }, { 8, 77 }, { -10, 66 }, { 3, 62 }, { -3, 68 }, { -20, 81 }, { 0, 30 }, { 1, 7 }, { -3, 23 }, { -21, 74 }, { 16, 66 }, { -23, 124 }, { 17, 37 }, { 44, -18 }, { 50, -34 }, { -22, 127 }, /* 166 - 226 */ { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, { 20, 34 }, { 19, 31 }, { 27, 44 }, { 19, 16 }, { 15, 36 }, { 15, 36 }, { 21, 28 }, { 25, 21 }, { 30, 20 }, { 31, 12 }, { 27, 16 }, { 24, 42 }, { 0, 93 }, { 14, 56 }, { 15, 57 }, { 26, 38 }, { -24, 127 }, /* 227 - 275 */ { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, { -12, 92 }, { -18, 108 }, { -4, 79 }, { -22, 69 }, { -16, 75 }, { -2, 58 }, { 1, 58 }, { -13, 78 }, { -9, 83 }, { -4, 81 }, { -13, 99 }, { -13, 81 }, { -6, 38 }, { -13, 62 }, { -6, 58 }, { -2, 59 }, { -16, 73 }, { -10, 76 }, { -13, 86 }, { -9, 83 }, { -10, 87 }, /* 276 a bit special (not used, x264_cabac_encode_bypass is used instead) */ { 0, 0 }, /* 277 - 337 */ { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 
}, { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, { -2, 76 }, { -18, 86 }, { 12, 70 }, { 5, 64 }, { -12, 70 }, { 11, 55 }, { 5, 56 }, { 0, 69 }, { 2, 65 }, { -6, 74 }, { 5, 54 }, { 7, 54 }, { -6, 76 }, { -11, 82 }, { -2, 77 }, { -2, 77 }, { 25, 42 }, /* 338 - 398 */ { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, { 18, 31 }, { 19, 26 }, { 36, 24 }, { 24, 23 }, { 27, 16 }, { 24, 30 }, { 31, 29 }, { 22, 41 }, { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, { 25, 61 }, /* 399 -> 435 */ { 21, 33 }, { 19, 50 }, { 17, 61 }, { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, { -6, 68 }, { -10, 79 }, /* 436 -> 459 */ { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, /* 460 - 1024 */ { 11, 80 }, { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, { -11, 104 }, { -11, 91 }, { -30, 127 }, { 11, 80 }, { 5, 76 }, { 2, 84 }, { 5, 78 }, { -6, 55 }, { 4, 61 }, { -14, 83 }, { -37, 127 }, { -5, 79 }, { -11, 104 }, { -11, 91 }, { -30, 127 }, { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, { -25, 64 }, { -29, 94 }, { 9, 75 }, { 17, 63 }, { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, { -4, 86 }, { -12, 88 }, { -5, 82 }, { -3, 72 }, { -4, 67 }, { -8, 72 }, { -16, 89 }, { -9, 69 }, { -1, 59 }, { 5, 66 }, { 4, 57 }, { -4, 71 }, { -2, 71 }, { 2, 58 }, { -1, 74 }, { -4, 44 }, { -1, 69 }, { 0, 62 }, { -7, 51 }, { -4, 47 }, { -6, 42 }, { -3, 41 }, { -6, 53 }, { 8, 76 }, { -9, 78 }, { -11, 83 }, { 9, 52 }, { 0, 67 }, { -5, 90 }, { 1, 67 }, { -15, 72 }, { -5, 75 }, { -8, 80 }, { -21, 83 }, { -21, 64 }, { -13, 31 }, { -25, 64 }, { -29, 94 }, { 9, 75 
}, { 17, 63 }, { -8, 74 }, { -5, 35 }, { -2, 27 }, { 13, 91 }, { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, { 4, 39 }, { 0, 42 }, { 7, 34 }, { 11, 29 }, { 8, 31 }, { 6, 37 }, { 7, 42 }, { 3, 40 }, { 8, 33 }, { 13, 43 }, { 13, 36 }, { 4, 47 }, { 3, 55 }, { 2, 58 }, { 6, 60 }, { 8, 44 }, { 11, 44 }, { 14, 42 }, { 7, 48 }, { 4, 56 }, { 4, 52 }, { 13, 37 }, { 9, 49 }, { 19, 58 }, { 10, 48 }, { 12, 45 }, { 0, 69 }, { 20, 33 }, { 8, 63 }, { 35, -18 }, { 33, -25 }, { 28, -3 }, { 24, 10 }, { 27, 0 }, { 34, -14 }, { 52, -44 }, { 39, -24 }, { 19, 17 }, { 31, 25 }, { 36, 29 }, { 24, 33 }, { 34, 15 }, { 30, 20 }, { 22, 73 }, { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, { -14, 59 }, { -9, 52 }, { -11, 68 }, { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, { 9, -2 }, { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, { -6, 68 }, { -10, 79 }, { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, { -14, 59 }, { -9, 52 }, { -11, 68 }, { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, { 9, -2 }, { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, { -6, 68 }, { -10, 79 }, { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, { -5, 78 }, { 7, 55 }, { 5, 59 }, { 2, 65 }, { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 }, { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 }, { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 }, { -10, 94 }, { -15, 102 }, { -10, 99 }, { -13, 106 }, { -50, 127 }, { -5, 92 }, { 17, 57 }, { -5, 86 }, { -13, 94 }, { -12, 91 }, { -2, 77 }, { 0, 71 }, { -1, 73 }, { 4, 64 }, { -7, 81 }, { 5, 64 }, { 15, 57 }, { 1, 67 }, { 0, 68 }, { -10, 67 }, { 1, 68 }, { 0, 77 }, { 2, 64 }, { 0, 68 }, { -5, 78 }, { 7, 55 }, { 
5, 59 }, { 2, 65 }, { 14, 54 }, { 15, 44 }, { 5, 60 }, { 2, 70 }, { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, { 17, -13 }, { 16, -9 }, { 17, -12 }, { 27, -21 }, { 37, -30 }, { 41, -40 }, { 42, -41 }, { 48, -47 }, { 39, -32 }, { 46, -40 }, { 52, -51 }, { 46, -41 }, { 52, -39 }, { 43, -19 }, { 32, 11 }, { 61, -55 }, { 56, -46 }, { 62, -50 }, { 81, -67 }, { 45, -20 }, { 35, -2 }, { 28, 15 }, { 34, 1 }, { 39, 1 }, { 30, 17 }, { 20, 38 }, { 18, 45 }, { 15, 54 }, { 0, 79 }, { 36, -16 }, { 37, -14 }, { 37, -17 }, { 32, 1 }, { 34, 15 }, { 29, 15 }, { 24, 25 }, { 34, 22 }, { 31, 16 }, { 35, 18 }, { 31, 28 }, { 33, 41 }, { 36, 28 }, { 27, 47 }, { 21, 62 }, { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, { -12, 92 }, { -18, 108 }, { -24, 115 }, { -22, 82 }, { -9, 62 }, { 0, 53 }, { 0, 59 }, { -14, 85 }, { -13, 89 }, { -13, 94 }, { -11, 92 }, { -29, 127 }, { -21, 100 }, { -14, 57 }, { -12, 67 }, { -11, 71 }, { -10, 77 }, { -21, 85 }, { -16, 88 }, { -23, 104 }, { -15, 98 }, { -37, 127 }, { -10, 82 }, { -8, 48 }, { -8, 61 }, { -8, 66 }, { -7, 70 }, { -14, 75 }, { -10, 79 }, { -9, 83 }, { -12, 92 }, { -18, 108 }, { -5, 79 }, { -11, 104 }, { -11, 91 }, { -30, 127 }, { -5, 79 }, { -11, 104 }, { -11, 91 }, { -30, 127 }, { -5, 79 }, { -11, 104 }, { -11, 91 }, { -30, 127 } } }; const uint8_t x264_cabac_range_lps[64][4] = { { 2, 2, 2, 2}, { 6, 7, 8, 9}, { 6, 7, 9, 10}, { 6, 8, 9, 11}, { 7, 8, 10, 11}, { 7, 9, 10, 12}, { 7, 9, 11, 12}, { 8, 9, 11, 13}, { 8, 10, 12, 14}, { 9, 11, 12, 14}, { 9, 11, 13, 15}, { 10, 12, 14, 16}, { 10, 12, 15, 17}, { 11, 13, 15, 18}, { 11, 14, 16, 19}, { 12, 14, 17, 20}, { 12, 15, 18, 21}, { 13, 16, 19, 22}, { 14, 17, 20, 23}, { 14, 18, 21, 24}, { 15, 19, 22, 25}, { 16, 20, 23, 27}, { 17, 21, 25, 28}, { 18, 22, 26, 30}, { 19, 23, 27, 31}, { 20, 24, 29, 33}, { 21, 26, 30, 35}, { 22, 27, 32, 37}, { 23, 28, 33, 39}, { 24, 30, 35, 41}, { 26, 31, 37, 43}, { 27, 33, 39, 45}, { 29, 35, 41, 48}, { 30, 37, 43, 50}, { 32, 39, 46, 53}, { 33, 41, 48, 56}, { 35, 43, 51, 59}, { 37, 45, 54, 62}, { 39, 48, 56, 65}, { 41, 50, 59, 69}, { 43, 53, 63, 72}, { 46, 56, 66, 76}, { 48, 59, 69, 80}, { 51, 62, 73, 85}, { 53, 65, 77, 89}, { 56, 69, 81, 94}, { 59, 72, 86, 99}, { 62, 76, 90, 104}, { 66, 80, 95, 110}, { 69, 85, 100, 116}, { 73, 89, 105, 122}, { 77, 94, 111, 128}, { 81, 99, 117, 135}, { 85, 104, 123, 142}, { 90, 110, 130, 150}, { 95, 116, 137, 158}, {100, 122, 144, 166}, {105, 128, 152, 175}, {111, 135, 160, 185}, {116, 142, 169, 195}, {123, 150, 178, 205}, {128, 158, 187, 216}, {128, 167, 197, 227}, {128, 176, 208, 240} }; const uint8_t x264_cabac_transition[128][2] = { { 0, 0}, { 1, 1}, { 2, 50}, { 51, 3}, { 2, 50}, { 51, 3}, { 4, 52}, { 53, 5}, { 6, 52}, { 53, 7}, { 8, 52}, { 53, 9}, { 10, 54}, { 
55, 11}, { 12, 54}, { 55, 13}, { 14, 54}, { 55, 15}, { 16, 56}, { 57, 17}, { 18, 56}, { 57, 19}, { 20, 56}, { 57, 21}, { 22, 58}, { 59, 23}, { 24, 58}, { 59, 25}, { 26, 60}, { 61, 27}, { 28, 60}, { 61, 29}, { 30, 60}, { 61, 31}, { 32, 62}, { 63, 33}, { 34, 62}, { 63, 35}, { 36, 64}, { 65, 37}, { 38, 66}, { 67, 39}, { 40, 66}, { 67, 41}, { 42, 66}, { 67, 43}, { 44, 68}, { 69, 45}, { 46, 68}, { 69, 47}, { 48, 70}, { 71, 49}, { 50, 72}, { 73, 51}, { 52, 72}, { 73, 53}, { 54, 74}, { 75, 55}, { 56, 74}, { 75, 57}, { 58, 76}, { 77, 59}, { 60, 78}, { 79, 61}, { 62, 78}, { 79, 63}, { 64, 80}, { 81, 65}, { 66, 82}, { 83, 67}, { 68, 82}, { 83, 69}, { 70, 84}, { 85, 71}, { 72, 84}, { 85, 73}, { 74, 88}, { 89, 75}, { 76, 88}, { 89, 77}, { 78, 90}, { 91, 79}, { 80, 90}, { 91, 81}, { 82, 94}, { 95, 83}, { 84, 94}, { 95, 85}, { 86, 96}, { 97, 87}, { 88, 96}, { 97, 89}, { 90, 100}, {101, 91}, { 92, 100}, {101, 93}, { 94, 102}, {103, 95}, { 96, 104}, {105, 97}, { 98, 104}, {105, 99}, {100, 108}, {109, 101}, {102, 108}, {109, 103}, {104, 110}, {111, 105}, {106, 112}, {113, 107}, {108, 114}, {115, 109}, {110, 116}, {117, 111}, {112, 118}, {119, 113}, {114, 118}, {119, 115}, {116, 122}, {123, 117}, {118, 122}, {123, 119}, {120, 124}, {125, 121}, {122, 126}, {127, 123}, {124, 127}, {126, 125} }; const uint8_t x264_cabac_renorm_shift[64] = { 6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; /* -ln2(probability) */ const uint16_t x264_cabac_entropy[128] = { FIX8(0.0273), FIX8(5.7370), FIX8(0.0288), FIX8(5.6618), FIX8(0.0303), FIX8(5.5866), FIX8(0.0320), FIX8(5.5114), FIX8(0.0337), FIX8(5.4362), FIX8(0.0355), FIX8(5.3610), FIX8(0.0375), FIX8(5.2859), FIX8(0.0395), FIX8(5.2106), FIX8(0.0416), FIX8(5.1354), FIX8(0.0439), FIX8(5.0602), FIX8(0.0463), FIX8(4.9851), FIX8(0.0488), FIX8(4.9099), FIX8(0.0515), FIX8(4.8347), FIX8(0.0543), FIX8(4.7595), FIX8(0.0572), FIX8(4.6843), FIX8(0.0604), FIX8(4.6091), FIX8(0.0637), FIX8(4.5339), FIX8(0.0671), FIX8(4.4588), FIX8(0.0708), FIX8(4.3836), FIX8(0.0747), FIX8(4.3083), FIX8(0.0788), FIX8(4.2332), FIX8(0.0832), FIX8(4.1580), FIX8(0.0878), FIX8(4.0828), FIX8(0.0926), FIX8(4.0076), FIX8(0.0977), FIX8(3.9324), FIX8(0.1032), FIX8(3.8572), FIX8(0.1089), FIX8(3.7820), FIX8(0.1149), FIX8(3.7068), FIX8(0.1214), FIX8(3.6316), FIX8(0.1282), FIX8(3.5565), FIX8(0.1353), FIX8(3.4813), FIX8(0.1429), FIX8(3.4061), FIX8(0.1510), FIX8(3.3309), FIX8(0.1596), FIX8(3.2557), FIX8(0.1686), FIX8(3.1805), FIX8(0.1782), FIX8(3.1053), FIX8(0.1884), FIX8(3.0301), FIX8(0.1992), FIX8(2.9549), FIX8(0.2107), FIX8(2.8797), FIX8(0.2229), FIX8(2.8046), FIX8(0.2358), FIX8(2.7294), FIX8(0.2496), FIX8(2.6542), FIX8(0.2642), FIX8(2.5790), FIX8(0.2798), FIX8(2.5038), FIX8(0.2964), FIX8(2.4286), FIX8(0.3142), FIX8(2.3534), FIX8(0.3331), FIX8(2.2782), FIX8(0.3532), FIX8(2.2030), FIX8(0.3748), FIX8(2.1278), FIX8(0.3979), FIX8(2.0527), FIX8(0.4226), FIX8(1.9775), FIX8(0.4491), FIX8(1.9023), FIX8(0.4776), FIX8(1.8271), FIX8(0.5082), FIX8(1.7519), FIX8(0.5412), FIX8(1.6767), FIX8(0.5768), FIX8(1.6015), FIX8(0.6152), FIX8(1.5263), FIX8(0.6568), FIX8(1.4511), FIX8(0.7020), FIX8(1.3759), FIX8(0.7513), FIX8(1.3008), FIX8(0.8050), FIX8(1.2256), FIX8(0.8638), FIX8(1.1504), FIX8(0.9285), FIX8(1.0752), FIX8(1.0000), FIX8(1.0000) }; /***************************************************************************** * RDO *****************************************************************************/ /* Padded to [64] for easier addressing */ const 
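/* Descriptive note: this table appears to map each scan position of an 8x8 block to the
 * CABAC context increment used for significant_coeff_flag; row [0] covers frame-coded
 * macroblocks and row [1] field-coded ones, matching the per-position context selection
 * of H.264 subclause 9.3.3.1.3. */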
uint8_t x264_significant_coeff_flag_offset_8x8[2][64] = {{ 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },{ 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5, 6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9, 9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }}; const uint8_t x264_last_coeff_flag_offset_8x8[63] = { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; const uint8_t x264_coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ const uint16_t x264_significant_coeff_flag_offset[2][16] = { { 105+0, 105+15, 105+29, 105+44, 105+47, 402, 484+0, 484+15, 484+29, 660, 528+0, 528+15, 528+29, 718, 0, 0 }, { 277+0, 277+15, 277+29, 277+44, 277+47, 436, 776+0, 776+15, 776+29, 675, 820+0, 820+15, 820+29, 733, 0, 0 } }; const uint16_t x264_last_coeff_flag_offset[2][16] = { { 166+0, 166+15, 166+29, 166+44, 166+47, 417, 572+0, 572+15, 572+29, 690, 616+0, 616+15, 616+29, 748, 0, 0 }, { 338+0, 338+15, 338+29, 338+44, 338+47, 451, 864+0, 864+15, 864+29, 699, 908+0, 908+15, 908+29, 757, 0, 0 } }; const uint16_t x264_coeff_abs_level_m1_offset[16] = { 227+0, 227+10, 227+20, 227+30, 227+39, 426, 952+0, 952+10, 952+20, 708, 982+0, 982+10, 982+20, 766 }; const uint8_t x264_count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; /***************************************************************************** * VLC *****************************************************************************/ /* [nC] */ const vlc_t x264_coeff0_token[6] = { { 0x1, 1 }, /* str=1 */ { 0x3, 2 }, /* str=11 */ { 0xf, 4 }, /* str=1111 */ { 0x3, 6 }, /* str=000011 */ { 0x1, 2 }, /* str=01 */ { 0x1, 1 }, /* str=1 */ }; /* [nC][i_total_coeff-1][i_trailing] */ const vlc_t x264_coeff_token[6][16][4] = { { /* table 0 */ { /* i_total 1 */ { 0x5, 6 }, /* str=000101 */ { 0x1, 2 }, /* str=01 */ }, { /* i_total 2 */ { 0x7, 8 }, /* str=00000111 */ { 0x4, 6 }, /* str=000100 */ { 0x1, 3 }, /* str=001 */ }, { /* i_total 3 */ { 0x7, 9 }, /* str=000000111 */ { 0x6, 8 }, /* str=00000110 */ { 0x5, 7 }, /* str=0000101 */ { 0x3, 5 }, /* str=00011 */ }, { /* i_total 4 */ { 0x7, 10 }, /* str=0000000111 */ { 0x6, 9 }, /* str=000000110 */ { 0x5, 8 }, /* str=00000101 */ { 0x3, 6 }, /* str=000011 */ }, { /* i_total 5 */ { 0x7, 11 }, /* str=00000000111 */ { 0x6, 10 }, /* str=0000000110 */ { 0x5, 9 }, /* str=000000101 */ { 0x4, 7 }, /* str=0000100 */ }, { /* i_total 6 */ { 0xf, 13 }, /* str=0000000001111 */ { 0x6, 11 }, /* str=00000000110 */ { 0x5, 10 }, /* str=0000000101 */ { 0x4, 8 }, /* str=00000100 */ }, { /* i_total 7 */ { 0xb, 13 }, /* str=0000000001011 */ { 0xe, 13 }, /* str=0000000001110 */ { 0x5, 11 }, /* str=00000000101 */ { 0x4, 9 }, /* str=000000100 */ }, { /* i_total 8 */ { 0x8, 13 }, /* str=0000000001000 */ { 0xa, 13 }, /* str=0000000001010 */ { 0xd, 13 }, /* str=0000000001101 */ { 0x4, 10 }, /* str=0000000100 */ }, { /* i_total 9 */ { 0xf, 14 }, /* str=00000000001111 */ { 0xe, 14 }, /* str=00000000001110 */ { 0x9, 13 }, /* str=0000000001001 */ { 0x4, 11 }, /* str=00000000100 */ }, { /* i_total 10 */ { 0xb, 14 }, /* str=00000000001011 */ { 0xa, 14 }, /* str=00000000001010 */ { 0xd, 14 }, /* str=00000000001101 */ { 0xc, 13 }, /* str=0000000001100 */ }, { /* i_total 14 */ { 0xf, 15 }, /* 
str=000000000001111 */ { 0xe, 15 }, /* str=000000000001110 */ { 0x9, 14 }, /* str=00000000001001 */ { 0xc, 14 }, /* str=00000000001100 */ }, { /* i_total 12 */ { 0xb, 15 }, /* str=000000000001011 */ { 0xa, 15 }, /* str=000000000001010 */ { 0xd, 15 }, /* str=000000000001101 */ { 0x8, 14 }, /* str=00000000001000 */ }, { /* i_total 13 */ { 0xf, 16 }, /* str=0000000000001111 */ { 0x1, 15 }, /* str=000000000000001 */ { 0x9, 15 }, /* str=000000000001001 */ { 0xc, 15 }, /* str=000000000001100 */ }, { /* i_total 14 */ { 0xb, 16 }, /* str=0000000000001011 */ { 0xe, 16 }, /* str=0000000000001110 */ { 0xd, 16 }, /* str=0000000000001101 */ { 0x8, 15 }, /* str=000000000001000 */ }, { /* i_total 15 */ { 0x7, 16 }, /* str=0000000000000111 */ { 0xa, 16 }, /* str=0000000000001010 */ { 0x9, 16 }, /* str=0000000000001001 */ { 0xc, 16 }, /* str=0000000000001100 */ }, { /* i_total 16 */ { 0x4, 16 }, /* str=0000000000000100 */ { 0x6, 16 }, /* str=0000000000000110 */ { 0x5, 16 }, /* str=0000000000000101 */ { 0x8, 16 }, /* str=0000000000001000 */ }, }, { /* table 1 */ { /* i_total 1 */ { 0xb, 6 }, /* str=001011 */ { 0x2, 2 }, /* str=10 */ }, { /* i_total 2 */ { 0x7, 6 }, /* str=000111 */ { 0x7, 5 }, /* str=00111 */ { 0x3, 3 }, /* str=011 */ }, { /* i_total 3 */ { 0x7, 7 }, /* str=0000111 */ { 0xa, 6 }, /* str=001010 */ { 0x9, 6 }, /* str=001001 */ { 0x5, 4 }, /* str=0101 */ }, { /* i_total 4 */ { 0x7, 8 }, /* str=00000111 */ { 0x6, 6 }, /* str=000110 */ { 0x5, 6 }, /* str=000101 */ { 0x4, 4 }, /* str=0100 */ }, { /* i_total 5 */ { 0x4, 8 }, /* str=00000100 */ { 0x6, 7 }, /* str=0000110 */ { 0x5, 7 }, /* str=0000101 */ { 0x6, 5 }, /* str=00110 */ }, { /* i_total 6 */ { 0x7, 9 }, /* str=000000111 */ { 0x6, 8 }, /* str=00000110 */ { 0x5, 8 }, /* str=00000101 */ { 0x8, 6 }, /* str=001000 */ }, { /* i_total 7 */ { 0xf, 11 }, /* str=00000001111 */ { 0x6, 9 }, /* str=000000110 */ { 0x5, 9 }, /* str=000000101 */ { 0x4, 6 }, /* str=000100 */ }, { /* i_total 8 */ { 0xb, 11 }, /* str=00000001011 */ { 0xe, 11 }, /* str=00000001110 */ { 0xd, 11 }, /* str=00000001101 */ { 0x4, 7 }, /* str=0000100 */ }, { /* i_total 9 */ { 0xf, 12 }, /* str=000000001111 */ { 0xa, 11 }, /* str=00000001010 */ { 0x9, 11 }, /* str=00000001001 */ { 0x4, 9 }, /* str=000000100 */ }, { /* i_total 10 */ { 0xb, 12 }, /* str=000000001011 */ { 0xe, 12 }, /* str=000000001110 */ { 0xd, 12 }, /* str=000000001101 */ { 0xc, 11 }, /* str=00000001100 */ }, { /* i_total 11 */ { 0x8, 12 }, /* str=000000001000 */ { 0xa, 12 }, /* str=000000001010 */ { 0x9, 12 }, /* str=000000001001 */ { 0x8, 11 }, /* str=00000001000 */ }, { /* i_total 12 */ { 0xf, 13 }, /* str=0000000001111 */ { 0xe, 13 }, /* str=0000000001110 */ { 0xd, 13 }, /* str=0000000001101 */ { 0xc, 12 }, /* str=000000001100 */ }, { /* i_total 13 */ { 0xb, 13 }, /* str=0000000001011 */ { 0xa, 13 }, /* str=0000000001010 */ { 0x9, 13 }, /* str=0000000001001 */ { 0xc, 13 }, /* str=0000000001100 */ }, { /* i_total 14 */ { 0x7, 13 }, /* str=0000000000111 */ { 0xb, 14 }, /* str=00000000001011 */ { 0x6, 13 }, /* str=0000000000110 */ { 0x8, 13 }, /* str=0000000001000 */ }, { /* i_total 15 */ { 0x9, 14 }, /* str=00000000001001 */ { 0x8, 14 }, /* str=00000000001000 */ { 0xa, 14 }, /* str=00000000001010 */ { 0x1, 13 }, /* str=0000000000001 */ }, { /* i_total 16 */ { 0x7, 14 }, /* str=00000000000111 */ { 0x6, 14 }, /* str=00000000000110 */ { 0x5, 14 }, /* str=00000000000101 */ { 0x4, 14 }, /* str=00000000000100 */ }, }, { /* table 2 */ { /* i_total 1 */ { 0xf, 6 }, /* str=001111 */ { 0xe, 4 }, /* str=1110 */ }, { /* 
i_total 2 */ { 0xb, 6 }, /* str=001011 */ { 0xf, 5 }, /* str=01111 */ { 0xd, 4 }, /* str=1101 */ }, { /* i_total 3 */ { 0x8, 6 }, /* str=001000 */ { 0xc, 5 }, /* str=01100 */ { 0xe, 5 }, /* str=01110 */ { 0xc, 4 }, /* str=1100 */ }, { /* i_total 4 */ { 0xf, 7 }, /* str=0001111 */ { 0xa, 5 }, /* str=01010 */ { 0xb, 5 }, /* str=01011 */ { 0xb, 4 }, /* str=1011 */ }, { /* i_total 5 */ { 0xb, 7 }, /* str=0001011 */ { 0x8, 5 }, /* str=01000 */ { 0x9, 5 }, /* str=01001 */ { 0xa, 4 }, /* str=1010 */ }, { /* i_total 6 */ { 0x9, 7 }, /* str=0001001 */ { 0xe, 6 }, /* str=001110 */ { 0xd, 6 }, /* str=001101 */ { 0x9, 4 }, /* str=1001 */ }, { /* i_total 7 */ { 0x8, 7 }, /* str=0001000 */ { 0xa, 6 }, /* str=001010 */ { 0x9, 6 }, /* str=001001 */ { 0x8, 4 }, /* str=1000 */ }, { /* i_total 8 */ { 0xf, 8 }, /* str=00001111 */ { 0xe, 7 }, /* str=0001110 */ { 0xd, 7 }, /* str=0001101 */ { 0xd, 5 }, /* str=01101 */ }, { /* i_total 9 */ { 0xb, 8 }, /* str=00001011 */ { 0xe, 8 }, /* str=00001110 */ { 0xa, 7 }, /* str=0001010 */ { 0xc, 6 }, /* str=001100 */ }, { /* i_total 10 */ { 0xf, 9 }, /* str=000001111 */ { 0xa, 8 }, /* str=00001010 */ { 0xd, 8 }, /* str=00001101 */ { 0xc, 7 }, /* str=0001100 */ }, { /* i_total 11 */ { 0xb, 9 }, /* str=000001011 */ { 0xe, 9 }, /* str=000001110 */ { 0x9, 8 }, /* str=00001001 */ { 0xc, 8 }, /* str=00001100 */ }, { /* i_total 12 */ { 0x8, 9 }, /* str=000001000 */ { 0xa, 9 }, /* str=000001010 */ { 0xd, 9 }, /* str=000001101 */ { 0x8, 8 }, /* str=00001000 */ }, { /* i_total 13 */ { 0xd, 10 }, /* str=0000001101 */ { 0x7, 9 }, /* str=000000111 */ { 0x9, 9 }, /* str=000001001 */ { 0xc, 9 }, /* str=000001100 */ }, { /* i_total 14 */ { 0x9, 10 }, /* str=0000001001 */ { 0xc, 10 }, /* str=0000001100 */ { 0xb, 10 }, /* str=0000001011 */ { 0xa, 10 }, /* str=0000001010 */ }, { /* i_total 15 */ { 0x5, 10 }, /* str=0000000101 */ { 0x8, 10 }, /* str=0000001000 */ { 0x7, 10 }, /* str=0000000111 */ { 0x6, 10 }, /* str=0000000110 */ }, { /* i_total 16 */ { 0x1, 10 }, /* str=0000000001 */ { 0x4, 10 }, /* str=0000000100 */ { 0x3, 10 }, /* str=0000000011 */ { 0x2, 10 }, /* str=0000000010 */ }, }, { /* table 3 */ { /* i_total 1 */ { 0x0, 6 }, /* str=000000 */ { 0x1, 6 }, /* str=000001 */ }, { /* i_total 2 */ { 0x4, 6 }, /* str=000100 */ { 0x5, 6 }, /* str=000101 */ { 0x6, 6 }, /* str=000110 */ }, { /* i_total 3 */ { 0x8, 6 }, /* str=001000 */ { 0x9, 6 }, /* str=001001 */ { 0xa, 6 }, /* str=001010 */ { 0xb, 6 }, /* str=001011 */ }, { /* i_total 4 */ { 0xc, 6 }, /* str=001100 */ { 0xd, 6 }, /* str=001101 */ { 0xe, 6 }, /* str=001110 */ { 0xf, 6 }, /* str=001111 */ }, { /* i_total 5 */ { 0x10, 6 }, /* str=010000 */ { 0x11, 6 }, /* str=010001 */ { 0x12, 6 }, /* str=010010 */ { 0x13, 6 }, /* str=010011 */ }, { /* i_total 6 */ { 0x14, 6 }, /* str=010100 */ { 0x15, 6 }, /* str=010101 */ { 0x16, 6 }, /* str=010110 */ { 0x17, 6 }, /* str=010111 */ }, { /* i_total 7 */ { 0x18, 6 }, /* str=011000 */ { 0x19, 6 }, /* str=011001 */ { 0x1a, 6 }, /* str=011010 */ { 0x1b, 6 }, /* str=011011 */ }, { /* i_total 8 */ { 0x1c, 6 }, /* str=011100 */ { 0x1d, 6 }, /* str=011101 */ { 0x1e, 6 }, /* str=011110 */ { 0x1f, 6 }, /* str=011111 */ }, { /* i_total 9 */ { 0x20, 6 }, /* str=100000 */ { 0x21, 6 }, /* str=100001 */ { 0x22, 6 }, /* str=100010 */ { 0x23, 6 }, /* str=100011 */ }, { /* i_total 10 */ { 0x24, 6 }, /* str=100100 */ { 0x25, 6 }, /* str=100101 */ { 0x26, 6 }, /* str=100110 */ { 0x27, 6 }, /* str=100111 */ }, { /* i_total 11 */ { 0x28, 6 }, /* str=101000 */ { 0x29, 6 }, /* str=101001 */ { 0x2a, 6 }, /* 
str=101010 */ { 0x2b, 6 }, /* str=101011 */ }, { /* i_total 12 */ { 0x2c, 6 }, /* str=101100 */ { 0x2d, 6 }, /* str=101101 */ { 0x2e, 6 }, /* str=101110 */ { 0x2f, 6 }, /* str=101111 */ }, { /* i_total 13 */ { 0x30, 6 }, /* str=110000 */ { 0x31, 6 }, /* str=110001 */ { 0x32, 6 }, /* str=110010 */ { 0x33, 6 }, /* str=110011 */ }, { /* i_total 14 */ { 0x34, 6 }, /* str=110100 */ { 0x35, 6 }, /* str=110101 */ { 0x36, 6 }, /* str=110110 */ { 0x37, 6 }, /* str=110111 */ }, { /* i_total 15 */ { 0x38, 6 }, /* str=111000 */ { 0x39, 6 }, /* str=111001 */ { 0x3a, 6 }, /* str=111010 */ { 0x3b, 6 }, /* str=111011 */ }, { /* i_total 16 */ { 0x3c, 6 }, /* str=111100 */ { 0x3d, 6 }, /* str=111101 */ { 0x3e, 6 }, /* str=111110 */ { 0x3f, 6 }, /* str=111111 */ }, }, { /* table 4 */ { /* i_total 1 */ { 0x7, 6 }, /* str=000111 */ { 0x1, 1 }, /* str=1 */ }, { /* i_total 2 */ { 0x4, 6 }, /* str=000100 */ { 0x6, 6 }, /* str=000110 */ { 0x1, 3 }, /* str=001 */ }, { /* i_total 3 */ { 0x3, 6 }, /* str=000011 */ { 0x3, 7 }, /* str=0000011 */ { 0x2, 7 }, /* str=0000010 */ { 0x5, 6 }, /* str=000101 */ }, { /* i_total 4 */ { 0x2, 6 }, /* str=000010 */ { 0x3, 8 }, /* str=00000011 */ { 0x2, 8 }, /* str=00000010 */ { 0x0, 7 }, /* str=0000000 */ }, }, { /* table 5 */ { /* i_total 1 */ { 0xf, 7 }, /* str=0001111 */ { 0x1, 2 }, /* str=01 */ }, { /* i_total 2 */ { 0xe, 7 }, /* str=0001110 */ { 0xd, 7 }, /* str=0001101 */ { 0x1, 3 }, /* str=001 */ }, { /* i_total 3 */ { 0x7, 9 }, /* str=000000111 */ { 0xc, 7 }, /* str=0001100 */ { 0xb, 7 }, /* str=0001011 */ { 0x1, 5 }, /* str=00001 */ }, { /* i_total 4 */ { 0x6, 9 }, /* str=000000110 */ { 0x5, 9 }, /* str=000000101 */ { 0xa, 7 }, /* str=0001010 */ { 0x1, 6 }, /* str=000001 */ }, { /* i_total 5 */ { 0x7, 10 }, /* str=0000000111 */ { 0x6, 10 }, /* str=0000000110 */ { 0x4, 9 }, /* str=000000100 */ { 0x9, 7 }, /* str=0001001 */ }, { /* i_total 6 */ { 0x7, 11 }, /* str=00000000111 */ { 0x6, 11 }, /* str=00000000110 */ { 0x5, 10 }, /* str=0000000101 */ { 0x8, 7 }, /* str=0001000 */ }, { /* i_total 7 */ { 0x7, 12 }, /* str=000000000111 */ { 0x6, 12 }, /* str=000000000110 */ { 0x5, 11 }, /* str=00000000101 */ { 0x4, 10 }, /* str=0000000100 */ }, { /* i_total 8 */ { 0x7, 13 }, /* str=0000000000111 */ { 0x5, 12 }, /* str=000000000101 */ { 0x4, 12 }, /* str=000000000100 */ { 0x4, 11 }, /* str=00000000100 */ }, }, }; /* [i_total_coeff-1][i_total_zeros] */ const vlc_t x264_total_zeros[15][16] = { { /* i_total 1 */ { 0x1, 1 }, /* str=1 */ { 0x3, 3 }, /* str=011 */ { 0x2, 3 }, /* str=010 */ { 0x3, 4 }, /* str=0011 */ { 0x2, 4 }, /* str=0010 */ { 0x3, 5 }, /* str=00011 */ { 0x2, 5 }, /* str=00010 */ { 0x3, 6 }, /* str=000011 */ { 0x2, 6 }, /* str=000010 */ { 0x3, 7 }, /* str=0000011 */ { 0x2, 7 }, /* str=0000010 */ { 0x3, 8 }, /* str=00000011 */ { 0x2, 8 }, /* str=00000010 */ { 0x3, 9 }, /* str=000000011 */ { 0x2, 9 }, /* str=000000010 */ { 0x1, 9 }, /* str=000000001 */ }, { /* i_total 2 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ { 0x3, 3 }, /* str=011 */ { 0x5, 4 }, /* str=0101 */ { 0x4, 4 }, /* str=0100 */ { 0x3, 4 }, /* str=0011 */ { 0x2, 4 }, /* str=0010 */ { 0x3, 5 }, /* str=00011 */ { 0x2, 5 }, /* str=00010 */ { 0x3, 6 }, /* str=000011 */ { 0x2, 6 }, /* str=000010 */ { 0x1, 6 }, /* str=000001 */ { 0x0, 6 }, /* str=000000 */ }, { /* i_total 3 */ { 0x5, 4 }, /* str=0101 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ { 0x4, 4 }, /* str=0100 */ { 0x3, 4 }, /* str=0011 */ { 0x4, 3 }, /* 
str=100 */ { 0x3, 3 }, /* str=011 */ { 0x2, 4 }, /* str=0010 */ { 0x3, 5 }, /* str=00011 */ { 0x2, 5 }, /* str=00010 */ { 0x1, 6 }, /* str=000001 */ { 0x1, 5 }, /* str=00001 */ { 0x0, 6 }, /* str=000000 */ }, { /* i_total 4 */ { 0x3, 5 }, /* str=00011 */ { 0x7, 3 }, /* str=111 */ { 0x5, 4 }, /* str=0101 */ { 0x4, 4 }, /* str=0100 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ { 0x3, 4 }, /* str=0011 */ { 0x3, 3 }, /* str=011 */ { 0x2, 4 }, /* str=0010 */ { 0x2, 5 }, /* str=00010 */ { 0x1, 5 }, /* str=00001 */ { 0x0, 5 }, /* str=00000 */ }, { /* i_total 5 */ { 0x5, 4 }, /* str=0101 */ { 0x4, 4 }, /* str=0100 */ { 0x3, 4 }, /* str=0011 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ { 0x3, 3 }, /* str=011 */ { 0x2, 4 }, /* str=0010 */ { 0x1, 5 }, /* str=00001 */ { 0x1, 4 }, /* str=0001 */ { 0x0, 5 }, /* str=00000 */ }, { /* i_total 6 */ { 0x1, 6 }, /* str=000001 */ { 0x1, 5 }, /* str=00001 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ { 0x3, 3 }, /* str=011 */ { 0x2, 3 }, /* str=010 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 3 }, /* str=001 */ { 0x0, 6 }, /* str=000000 */ }, { /* i_total 7 */ { 0x1, 6 }, /* str=000001 */ { 0x1, 5 }, /* str=00001 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ { 0x3, 3 }, /* str=011 */ { 0x3, 2 }, /* str=11 */ { 0x2, 3 }, /* str=010 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 3 }, /* str=001 */ { 0x0, 6 }, /* str=000000 */ }, { /* i_total 8 */ { 0x1, 6 }, /* str=000001 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 5 }, /* str=00001 */ { 0x3, 3 }, /* str=011 */ { 0x3, 2 }, /* str=11 */ { 0x2, 2 }, /* str=10 */ { 0x2, 3 }, /* str=010 */ { 0x1, 3 }, /* str=001 */ { 0x0, 6 }, /* str=000000 */ }, { /* i_total 9 */ { 0x1, 6 }, /* str=000001 */ { 0x0, 6 }, /* str=000000 */ { 0x1, 4 }, /* str=0001 */ { 0x3, 2 }, /* str=11 */ { 0x2, 2 }, /* str=10 */ { 0x1, 3 }, /* str=001 */ { 0x1, 2 }, /* str=01 */ { 0x1, 5 }, /* str=00001 */ }, { /* i_total 10 */ { 0x1, 5 }, /* str=00001 */ { 0x0, 5 }, /* str=00000 */ { 0x1, 3 }, /* str=001 */ { 0x3, 2 }, /* str=11 */ { 0x2, 2 }, /* str=10 */ { 0x1, 2 }, /* str=01 */ { 0x1, 4 }, /* str=0001 */ }, { /* i_total 11 */ { 0x0, 4 }, /* str=0000 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 3 }, /* str=001 */ { 0x2, 3 }, /* str=010 */ { 0x1, 1 }, /* str=1 */ { 0x3, 3 }, /* str=011 */ }, { /* i_total 12 */ { 0x0, 4 }, /* str=0000 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 2 }, /* str=01 */ { 0x1, 1 }, /* str=1 */ { 0x1, 3 }, /* str=001 */ }, { /* i_total 13 */ { 0x0, 3 }, /* str=000 */ { 0x1, 3 }, /* str=001 */ { 0x1, 1 }, /* str=1 */ { 0x1, 2 }, /* str=01 */ }, { /* i_total 14 */ { 0x0, 2 }, /* str=00 */ { 0x1, 2 }, /* str=01 */ { 0x1, 1 }, /* str=1 */ }, { /* i_total 15 */ { 0x0, 1 }, /* str=0 */ { 0x1, 1 }, /* str=1 */ }, }; /* [i_total_coeff-1][i_total_zeros] */ const vlc_t x264_total_zeros_2x2_dc[3][4] = { { /* i_total 1 */ { 0x1, 1 }, /* str=1 */ { 0x1, 2 }, /* str=01 */ { 0x1, 3 }, /* str=001 */ { 0x0, 3 } /* str=000 */ }, { /* i_total 2 */ { 0x1, 1 }, /* str=1 */ { 0x1, 2 }, /* str=01 */ { 0x0, 2 }, /* str=00 */ }, { /* i_total 3 */ { 0x1, 1 }, /* str=1 */ { 0x0, 1 }, /* str=0 */ }, }; /* [i_total_coeff-1][i_total_zeros] */ const vlc_t x264_total_zeros_2x4_dc[7][8] = { { /* i_total 1 */ { 0x1, 1 }, /* str=1 */ { 0x2, 3 }, /* str=010 */ { 0x3, 3 }, /* str=011 */ { 0x2, 4 }, /* str=0010 */ { 0x3, 4 }, /* str=0011 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 5 }, /* str=00001 */ { 0x0, 5 }, /* str=00000 */ }, { /* 
i_total 2 */ { 0x0, 3 }, /* str=000 */ { 0x1, 2 }, /* str=01 */ { 0x1, 3 }, /* str=001 */ { 0x4, 3 }, /* str=100 */ { 0x5, 3 }, /* str=101 */ { 0x6, 3 }, /* str=110 */ { 0x7, 3 }, /* str=111 */ }, { /* i_total 3 */ { 0x0, 3 }, /* str=000 */ { 0x1, 3 }, /* str=001 */ { 0x1, 2 }, /* str=01 */ { 0x2, 2 }, /* str=10 */ { 0x6, 3 }, /* str=110 */ { 0x7, 3 }, /* str=111 */ }, { /* i_total 4 */ { 0x6, 3 }, /* str=110 */ { 0x0, 2 }, /* str=00 */ { 0x1, 2 }, /* str=01 */ { 0x2, 2 }, /* str=10 */ { 0x7, 3 }, /* str=111 */ }, { /* i_total 5 */ { 0x0, 2 }, /* str=00 */ { 0x1, 2 }, /* str=01 */ { 0x2, 2 }, /* str=10 */ { 0x3, 2 }, /* str=11 */ }, { /* i_total 6 */ { 0x0, 2 }, /* str=00 */ { 0x1, 2 }, /* str=01 */ { 0x1, 1 }, /* str=1 */ }, { /* i_total 7 */ { 0x0, 1 }, /* str=0 */ { 0x1, 1 }, /* str=1 */ } }; /* [MIN( i_zero_left-1, 6 )][run_before] */ const vlc_t x264_run_before_init[7][16] = { { /* i_zero_left 1 */ { 0x1, 1 }, /* str=1 */ { 0x0, 1 }, /* str=0 */ }, { /* i_zero_left 2 */ { 0x1, 1 }, /* str=1 */ { 0x1, 2 }, /* str=01 */ { 0x0, 2 }, /* str=00 */ }, { /* i_zero_left 3 */ { 0x3, 2 }, /* str=11 */ { 0x2, 2 }, /* str=10 */ { 0x1, 2 }, /* str=01 */ { 0x0, 2 }, /* str=00 */ }, { /* i_zero_left 4 */ { 0x3, 2 }, /* str=11 */ { 0x2, 2 }, /* str=10 */ { 0x1, 2 }, /* str=01 */ { 0x1, 3 }, /* str=001 */ { 0x0, 3 }, /* str=000 */ }, { /* i_zero_left 5 */ { 0x3, 2 }, /* str=11 */ { 0x2, 2 }, /* str=10 */ { 0x3, 3 }, /* str=011 */ { 0x2, 3 }, /* str=010 */ { 0x1, 3 }, /* str=001 */ { 0x0, 3 }, /* str=000 */ }, { /* i_zero_left 6 */ { 0x3, 2 }, /* str=11 */ { 0x0, 3 }, /* str=000 */ { 0x1, 3 }, /* str=001 */ { 0x3, 3 }, /* str=011 */ { 0x2, 3 }, /* str=010 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ }, { /* i_zero_left >6 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ { 0x3, 3 }, /* str=011 */ { 0x2, 3 }, /* str=010 */ { 0x1, 3 }, /* str=001 */ { 0x1, 4 }, /* str=0001 */ { 0x1, 5 }, /* str=00001 */ { 0x1, 6 }, /* str=000001 */ { 0x1, 7 }, /* str=0000001 */ { 0x1, 8 }, /* str=00000001 */ { 0x1, 9 }, /* str=000000001 */ { 0x1, 10 }, /* str=0000000001 */ { 0x1, 11 }, /* str=00000000001 */ }, }; /* psy_trellis_init() has the largest size requirement of 16*FDEC_STRIDE*SIZEOF_PIXEL */ ALIGNED_64( uint8_t x264_zero[1024] ) = { 0 }; x264-master/common/tables.h000066400000000000000000000100351502133446700157710ustar00rootroot00000000000000/***************************************************************************** * tables.h: const tables ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_TABLES_H #define X264_TABLES_H typedef struct { uint8_t i_bits; uint8_t i_size; } vlc_t; X264_API extern const x264_level_t x264_levels[]; extern const uint8_t x264_exp2_lut[64]; extern const float x264_log2_lut[128]; extern const float x264_log2_lz_lut[32]; #define QP_MAX_MAX (51+6*2+18) extern const uint16_t x264_lambda_tab[QP_MAX_MAX+1]; extern const int x264_lambda2_tab[QP_MAX_MAX+1]; extern const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1]; #define MAX_CHROMA_LAMBDA_OFFSET 36 extern const uint16_t x264_chroma_lambda2_offset_tab[MAX_CHROMA_LAMBDA_OFFSET+1]; extern const uint8_t x264_hpel_ref0[16]; extern const uint8_t x264_hpel_ref1[16]; extern const uint8_t x264_cqm_jvt4i[16]; extern const uint8_t x264_cqm_jvt4p[16]; extern const uint8_t x264_cqm_jvt8i[64]; extern const uint8_t x264_cqm_jvt8p[64]; extern const uint8_t x264_cqm_flat16[64]; extern const uint8_t * const x264_cqm_jvt[8]; extern const uint8_t x264_cqm_avci50_4ic[16]; extern const uint8_t x264_cqm_avci50_p_8iy[64]; extern const uint8_t x264_cqm_avci50_1080i_8iy[64]; extern const uint8_t x264_cqm_avci100_720p_4ic[16]; extern const uint8_t x264_cqm_avci100_720p_8iy[64]; extern const uint8_t x264_cqm_avci100_1080_4ic[16]; extern const uint8_t x264_cqm_avci100_1080i_8iy[64]; extern const uint8_t x264_cqm_avci100_1080p_8iy[64]; extern const uint8_t x264_cqm_avci300_2160p_4iy[16]; extern const uint8_t x264_cqm_avci300_2160p_4ic[16]; extern const uint8_t x264_cqm_avci300_2160p_8iy[64]; extern const uint8_t x264_decimate_table4[16]; extern const uint8_t x264_decimate_table8[64]; extern const uint32_t x264_dct4_weight_tab[16]; extern const uint32_t x264_dct8_weight_tab[64]; extern const uint32_t x264_dct4_weight2_tab[16]; extern const uint32_t x264_dct8_weight2_tab[64]; extern const int8_t x264_cabac_context_init_I[1024][2]; extern const int8_t x264_cabac_context_init_PB[3][1024][2]; extern const uint8_t x264_cabac_range_lps[64][4]; extern const uint8_t x264_cabac_transition[128][2]; extern const uint8_t x264_cabac_renorm_shift[64]; extern const uint16_t x264_cabac_entropy[128]; extern const uint8_t x264_significant_coeff_flag_offset_8x8[2][64]; extern const uint8_t x264_last_coeff_flag_offset_8x8[63]; extern const uint8_t x264_coeff_flag_offset_chroma_422_dc[7]; extern const uint16_t x264_significant_coeff_flag_offset[2][16]; extern const uint16_t x264_last_coeff_flag_offset[2][16]; extern const uint16_t x264_coeff_abs_level_m1_offset[16]; extern const uint8_t x264_count_cat_m1[14]; extern const vlc_t x264_coeff0_token[6]; extern const vlc_t x264_coeff_token[6][16][4]; extern const vlc_t x264_total_zeros[15][16]; extern const vlc_t x264_total_zeros_2x2_dc[3][4]; extern const vlc_t x264_total_zeros_2x4_dc[7][8]; extern const vlc_t x264_run_before_init[7][16]; extern uint8_t x264_zero[1024]; #endif x264-master/common/threadpool.c000066400000000000000000000123101502133446700166510ustar00rootroot00000000000000/***************************************************************************** * threadpool.c: thread pooling ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" typedef struct { void *(*func)(void *); void *arg; void *ret; } x264_threadpool_job_t; struct x264_threadpool_t { volatile int exit; int threads; x264_pthread_t *thread_handle; /* requires a synchronized list structure and associated methods, so use what is already implemented for frames */ x264_sync_frame_list_t uninit; /* list of jobs that are awaiting use */ x264_sync_frame_list_t run; /* list of jobs that are queued for processing by the pool */ x264_sync_frame_list_t done; /* list of jobs that have finished processing */ }; REALIGN_STACK static void *threadpool_thread( x264_threadpool_t *pool ) { while( !pool->exit ) { x264_threadpool_job_t *job = NULL; x264_pthread_mutex_lock( &pool->run.mutex ); while( !pool->exit && !pool->run.i_size ) x264_pthread_cond_wait( &pool->run.cv_fill, &pool->run.mutex ); if( pool->run.i_size ) { job = (void*)x264_frame_shift( pool->run.list ); pool->run.i_size--; } x264_pthread_mutex_unlock( &pool->run.mutex ); if( !job ) continue; job->ret = job->func( job->arg ); x264_sync_frame_list_push( &pool->done, (void*)job ); } return NULL; } int x264_threadpool_init( x264_threadpool_t **p_pool, int threads ) { if( threads <= 0 ) return -1; if( x264_threading_init() < 0 ) return -1; x264_threadpool_t *pool; CHECKED_MALLOCZERO( pool, sizeof(x264_threadpool_t) ); *p_pool = pool; pool->threads = threads; CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) ); if( x264_sync_frame_list_init( &pool->uninit, pool->threads ) || x264_sync_frame_list_init( &pool->run, pool->threads ) || x264_sync_frame_list_init( &pool->done, pool->threads ) ) goto fail; for( int i = 0; i < pool->threads; i++ ) { x264_threadpool_job_t *job; CHECKED_MALLOC( job, sizeof(x264_threadpool_job_t) ); x264_sync_frame_list_push( &pool->uninit, (void*)job ); } for( int i = 0; i < pool->threads; i++ ) if( x264_pthread_create( pool->thread_handle+i, NULL, (void*)threadpool_thread, pool ) ) goto fail; return 0; fail: return -1; } void x264_threadpool_run( x264_threadpool_t *pool, void *(*func)(void *), void *arg ) { x264_threadpool_job_t *job = (void*)x264_sync_frame_list_pop( &pool->uninit ); job->func = func; job->arg = arg; x264_sync_frame_list_push( &pool->run, (void*)job ); } void *x264_threadpool_wait( x264_threadpool_t *pool, void *arg ) { x264_pthread_mutex_lock( &pool->done.mutex ); while( 1 ) { for( int i = 0; i < pool->done.i_size; i++ ) if( ((x264_threadpool_job_t*)pool->done.list[i])->arg == arg ) { x264_threadpool_job_t *job = (void*)x264_frame_shift( pool->done.list+i ); pool->done.i_size--; x264_pthread_mutex_unlock( &pool->done.mutex ); void *ret = job->ret; x264_sync_frame_list_push( &pool->uninit, (void*)job ); return ret; } x264_pthread_cond_wait( &pool->done.cv_fill, &pool->done.mutex ); } } static void threadpool_list_delete( x264_sync_frame_list_t *slist ) { for( int i = 0; 
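/* the synchronized frame list is zero-initialized and effectively NULL-terminated, so each
 * queued job structure is freed up to the first empty slot */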
slist->list[i]; i++ ) { x264_free( slist->list[i] ); slist->list[i] = NULL; } x264_sync_frame_list_delete( slist ); } void x264_threadpool_delete( x264_threadpool_t *pool ) { x264_pthread_mutex_lock( &pool->run.mutex ); pool->exit = 1; x264_pthread_cond_broadcast( &pool->run.cv_fill ); x264_pthread_mutex_unlock( &pool->run.mutex ); for( int i = 0; i < pool->threads; i++ ) x264_pthread_join( pool->thread_handle[i], NULL ); threadpool_list_delete( &pool->uninit ); threadpool_list_delete( &pool->run ); threadpool_list_delete( &pool->done ); x264_free( pool->thread_handle ); x264_free( pool ); } x264-master/common/threadpool.h000066400000000000000000000040221502133446700166570ustar00rootroot00000000000000/***************************************************************************** * threadpool.h: thread pooling ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_THREADPOOL_H #define X264_THREADPOOL_H typedef struct x264_threadpool_t x264_threadpool_t; #if HAVE_THREAD #define x264_threadpool_init x264_template(threadpool_init) X264_API int x264_threadpool_init( x264_threadpool_t **p_pool, int threads ); #define x264_threadpool_run x264_template(threadpool_run) X264_API void x264_threadpool_run( x264_threadpool_t *pool, void *(*func)(void *), void *arg ); #define x264_threadpool_wait x264_template(threadpool_wait) X264_API void *x264_threadpool_wait( x264_threadpool_t *pool, void *arg ); #define x264_threadpool_delete x264_template(threadpool_delete) X264_API void x264_threadpool_delete( x264_threadpool_t *pool ); #else #define x264_threadpool_init(p,t) -1 #define x264_threadpool_run(p,f,a) #define x264_threadpool_wait(p,a) NULL #define x264_threadpool_delete(p) #endif #endif x264-master/common/vlc.c000066400000000000000000000074411502133446700153050ustar00rootroot00000000000000/***************************************************************************** * vlc.c : vlc tables ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common.h" vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE]; uint32_t x264_run_before[1<<16]; void x264_cavlc_init( x264_t *h ) { for( int i_suffix = 0; i_suffix < 7; i_suffix++ ) for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ ) { int mask = level >> 15; int abs_level = (level^mask)-mask; int i_level_code = abs_level ? abs_level*2-mask-2 : 0; int i_next = i_suffix; vlc_large_t *vlc = &x264_level_token[i_suffix][level+LEVEL_TABLE_SIZE/2]; if( ( i_level_code >> i_suffix ) < 14 ) { vlc->i_size = (i_level_code >> i_suffix) + 1 + i_suffix; vlc->i_bits = (1<i_size = 19; vlc->i_bits = (1<<4) + (i_level_code - 14); } else if( i_suffix > 0 && ( i_level_code >> i_suffix ) == 14 ) { vlc->i_size = 15 + i_suffix; vlc->i_bits = (1<i_size = 28; vlc->i_bits = (1<<12) + i_level_code; } if( i_next == 0 ) i_next++; if( abs_level > (3 << (i_next-1)) && i_next < 6 ) i_next++; vlc->i_next = i_next; } x264_run_before[0] = 0; x264_run_before[1] = 0; for( uint32_t i = 2; i < (1<<16); i++ ) { x264_run_level_t runlevel; ALIGNED_ARRAY_16( dctcoef, dct, [16] ); int size = 0; int bits = 0; for( int j = 0; j < 16; j++ ) dct[j] = i&(1<quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel ); int zeros = runlevel.last + 1 - total; uint32_t mask = i << (x264_clz( i ) + 1); for( int j = 0; j < total-1 && zeros > 0; j++ ) { int idx = X264_MIN(zeros, 7) - 1; int run = x264_clz( mask ); int len = x264_run_before_init[idx][run].i_size; size += len; bits <<= len; bits |= x264_run_before_init[idx][run].i_bits; zeros -= run; mask <<= run + 1; } x264_run_before[i] = (bits << 5) + size; } } x264-master/common/win32thread.c000066400000000000000000000300321502133446700166430ustar00rootroot00000000000000/***************************************************************************** * win32thread.c: windows threading ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * Pegasys Inc. * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ /* Microsoft's way of supporting systems with >64 logical cpus can be found at * http://www.microsoft.com/whdc/system/Sysinternals/MoreThan64proc.mspx */ /* Based on the agreed standing that x264 does not need to utilize >64 logical cpus, * this API does not detect nor utilize more than 64 cpus for systems that have them. */ #include "base.h" #if HAVE_WINRT /* _beginthreadex() is technically the correct option, but it's only available for Desktop applications. * Using CreateThread() as an alternative works on Windows Store and Windows Phone 8.1+ as long as we're * using a dynamically linked MSVCRT which happens to be a requirement for WinRT applications anyway */ #define _beginthreadex CreateThread #define InitializeCriticalSectionAndSpinCount(a, b) InitializeCriticalSectionEx(a, b, CRITICAL_SECTION_NO_DEBUG_INFO) #define WaitForSingleObject(a, b) WaitForSingleObjectEx(a, b, FALSE) #else #include #endif /* number of times to spin a thread about to block on a locked mutex before retrying and sleeping if still locked */ #define X264_SPIN_COUNT 0 /* global mutex for replacing MUTEX_INITIALIZER instances */ static x264_pthread_mutex_t static_mutex; /* _beginthreadex requires that the start routine is __stdcall */ static unsigned __stdcall win32thread_worker( void *arg ) { x264_pthread_t *h = arg; *h->p_ret = h->func( h->arg ); return 0; } int x264_pthread_create( x264_pthread_t *thread, const x264_pthread_attr_t *attr, void *(*start_routine)( void* ), void *arg ) { thread->func = start_routine; thread->arg = arg; thread->p_ret = &thread->ret; thread->ret = NULL; thread->handle = (void*)_beginthreadex( NULL, 0, win32thread_worker, thread, 0, NULL ); return !thread->handle; } int x264_pthread_join( x264_pthread_t thread, void **value_ptr ) { DWORD ret = WaitForSingleObject( thread.handle, INFINITE ); if( ret != WAIT_OBJECT_0 ) return -1; if( value_ptr ) *value_ptr = *thread.p_ret; CloseHandle( thread.handle ); return 0; } int x264_pthread_mutex_init( x264_pthread_mutex_t *mutex, const x264_pthread_mutexattr_t *attr ) { return !InitializeCriticalSectionAndSpinCount( mutex, X264_SPIN_COUNT ); } int x264_pthread_mutex_destroy( x264_pthread_mutex_t *mutex ) { DeleteCriticalSection( mutex ); return 0; } int x264_pthread_mutex_lock( x264_pthread_mutex_t *mutex ) { static const x264_pthread_mutex_t init = X264_PTHREAD_MUTEX_INITIALIZER; if( !memcmp( mutex, &init, sizeof(x264_pthread_mutex_t) ) ) { int ret = 0; EnterCriticalSection( &static_mutex ); if( !memcmp( mutex, &init, sizeof(x264_pthread_mutex_t) ) ) ret = x264_pthread_mutex_init( mutex, NULL ); LeaveCriticalSection( &static_mutex ); if( ret ) return ret; } EnterCriticalSection( mutex ); return 0; } int x264_pthread_mutex_unlock( x264_pthread_mutex_t *mutex ) { LeaveCriticalSection( mutex ); return 0; } void x264_win32_threading_destroy( void ) { x264_pthread_mutex_destroy( &static_mutex ); memset( &static_mutex, 0, sizeof(static_mutex) ); } #if HAVE_WINRT int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr ) { InitializeConditionVariable( cond ); return 0; } int x264_pthread_cond_destroy( x264_pthread_cond_t *cond ) { return 0; } int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond ) { WakeAllConditionVariable( cond ); return 0; } int x264_pthread_cond_signal( x264_pthread_cond_t *cond ) { WakeConditionVariable( cond ); return 0; } int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex ) { 
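/* On the HAVE_WINRT path the native condition-variable API is used directly:
 * SleepConditionVariableCS atomically releases the critical section while the thread
 * blocks and re-acquires it before returning, so none of the manual semaphore/waiter
 * bookkeeping of the fallback implementation below is required. */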
return !SleepConditionVariableCS( cond, mutex, INFINITE ); } int x264_win32_threading_init( void ) { return x264_pthread_mutex_init( &static_mutex, NULL ); } int x264_pthread_num_processors_np( void ) { SYSTEM_INFO si; GetNativeSystemInfo(&si); return si.dwNumberOfProcessors; } #else static struct { /* function pointers to conditional variable API on windows 6.0+ kernels */ void (WINAPI *cond_broadcast)( x264_pthread_cond_t *cond ); void (WINAPI *cond_init)( x264_pthread_cond_t *cond ); void (WINAPI *cond_signal)( x264_pthread_cond_t *cond ); BOOL (WINAPI *cond_wait)( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex, DWORD milliseconds ); } thread_control; /* for pre-Windows 6.0 platforms we need to define and use our own condition variable and api */ typedef struct { x264_pthread_mutex_t mtx_broadcast; x264_pthread_mutex_t mtx_waiter_count; volatile int waiter_count; HANDLE semaphore; HANDLE waiters_done; volatile int is_broadcast; } x264_win32_cond_t; int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr ) { if( thread_control.cond_init ) { thread_control.cond_init( cond ); return 0; } /* non native condition variables */ x264_win32_cond_t *win32_cond = calloc( 1, sizeof(x264_win32_cond_t) ); if( !win32_cond ) return -1; cond->Ptr = win32_cond; win32_cond->semaphore = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL ); if( !win32_cond->semaphore ) return -1; if( x264_pthread_mutex_init( &win32_cond->mtx_waiter_count, NULL ) ) return -1; if( x264_pthread_mutex_init( &win32_cond->mtx_broadcast, NULL ) ) return -1; win32_cond->waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL ); if( !win32_cond->waiters_done ) return -1; return 0; } int x264_pthread_cond_destroy( x264_pthread_cond_t *cond ) { /* native condition variables do not destroy */ if( thread_control.cond_init ) return 0; /* non native condition variables */ x264_win32_cond_t *win32_cond = cond->Ptr; CloseHandle( win32_cond->semaphore ); CloseHandle( win32_cond->waiters_done ); x264_pthread_mutex_destroy( &win32_cond->mtx_broadcast ); x264_pthread_mutex_destroy( &win32_cond->mtx_waiter_count ); free( win32_cond ); return 0; } int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond ) { if( thread_control.cond_broadcast ) { thread_control.cond_broadcast( cond ); return 0; } /* non native condition variables */ x264_win32_cond_t *win32_cond = cond->Ptr; x264_pthread_mutex_lock( &win32_cond->mtx_broadcast ); x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count ); int have_waiter = 0; if( win32_cond->waiter_count ) { win32_cond->is_broadcast = 1; have_waiter = 1; } if( have_waiter ) { ReleaseSemaphore( win32_cond->semaphore, win32_cond->waiter_count, NULL ); x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count ); WaitForSingleObject( win32_cond->waiters_done, INFINITE ); win32_cond->is_broadcast = 0; } else x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count ); return x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast ); } int x264_pthread_cond_signal( x264_pthread_cond_t *cond ) { if( thread_control.cond_signal ) { thread_control.cond_signal( cond ); return 0; } /* non-native condition variables */ x264_win32_cond_t *win32_cond = cond->Ptr; x264_pthread_mutex_lock( &win32_cond->mtx_broadcast ); x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count ); int have_waiter = win32_cond->waiter_count; x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count ); if( have_waiter ) { ReleaseSemaphore( win32_cond->semaphore, 1, NULL ); WaitForSingleObject( win32_cond->waiters_done, INFINITE 
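/* the woken waiter sets waiters_done only after it has decremented waiter_count, so
 * waiting here keeps mtx_broadcast held until that handoff completes, mirroring the
 * handshake used by x264_pthread_cond_broadcast above */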
); } return x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast ); } int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex ) { if( thread_control.cond_wait ) return !thread_control.cond_wait( cond, mutex, INFINITE ); /* non native condition variables */ x264_win32_cond_t *win32_cond = cond->Ptr; x264_pthread_mutex_lock( &win32_cond->mtx_broadcast ); x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count ); win32_cond->waiter_count++; x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count ); x264_pthread_mutex_unlock( &win32_cond->mtx_broadcast ); // unlock the external mutex x264_pthread_mutex_unlock( mutex ); WaitForSingleObject( win32_cond->semaphore, INFINITE ); x264_pthread_mutex_lock( &win32_cond->mtx_waiter_count ); win32_cond->waiter_count--; int last_waiter = !win32_cond->waiter_count || !win32_cond->is_broadcast; x264_pthread_mutex_unlock( &win32_cond->mtx_waiter_count ); if( last_waiter ) SetEvent( win32_cond->waiters_done ); // lock the external mutex return x264_pthread_mutex_lock( mutex ); } int x264_win32_threading_init( void ) { /* find function pointers to API functions, if they exist */ HANDLE kernel_dll = GetModuleHandleW( L"kernel32.dll" ); thread_control.cond_init = (void*)GetProcAddress( kernel_dll, "InitializeConditionVariable" ); if( thread_control.cond_init ) { /* we're on a windows 6.0+ kernel, acquire the rest of the functions */ thread_control.cond_broadcast = (void*)GetProcAddress( kernel_dll, "WakeAllConditionVariable" ); thread_control.cond_signal = (void*)GetProcAddress( kernel_dll, "WakeConditionVariable" ); thread_control.cond_wait = (void*)GetProcAddress( kernel_dll, "SleepConditionVariableCS" ); } return x264_pthread_mutex_init( &static_mutex, NULL ); } int x264_pthread_num_processors_np( void ) { DWORD_PTR system_cpus, process_cpus = 0; int cpus = 0; /* GetProcessAffinityMask returns affinities of 0 when the process has threads in multiple processor groups. * On platforms that support processor grouping, use GetThreadGroupAffinity to get the current thread's affinity instead. */ #if ARCH_X86_64 /* find function pointers to API functions specific to x86_64 platforms, if they exist */ HANDLE kernel_dll = GetModuleHandleW( L"kernel32.dll" ); BOOL (*get_thread_affinity)( HANDLE thread, void *group_affinity ) = (void*)GetProcAddress( kernel_dll, "GetThreadGroupAffinity" ); if( get_thread_affinity ) { /* running on a platform that supports >64 logical cpus */ struct /* GROUP_AFFINITY */ { ULONG_PTR mask; // KAFFINITY = ULONG_PTR USHORT group; USHORT reserved[3]; } thread_affinity; if( get_thread_affinity( GetCurrentThread(), &thread_affinity ) ) process_cpus = thread_affinity.mask; } #endif if( !process_cpus ) GetProcessAffinityMask( GetCurrentProcess(), &process_cpus, &system_cpus ); for( DWORD_PTR bit = 1; bit; bit <<= 1 ) cpus += !!(process_cpus & bit); return cpus ? cpus : 1; } #endif x264-master/common/win32thread.h000066400000000000000000000056511502133446700166610ustar00rootroot00000000000000/***************************************************************************** * win32thread.h: windows threading ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_WIN32THREAD_H #define X264_WIN32THREAD_H #include /* the following macro is used within x264 */ #undef ERROR typedef struct { void *handle; void *(*func)( void* arg ); void *arg; void **p_ret; void *ret; } x264_pthread_t; #define x264_pthread_attr_t int /* the conditional variable api for windows 6.0+ uses critical sections and not mutexes */ typedef CRITICAL_SECTION x264_pthread_mutex_t; #define X264_PTHREAD_MUTEX_INITIALIZER {0} #define x264_pthread_mutexattr_t int #if HAVE_WINRT typedef CONDITION_VARIABLE x264_pthread_cond_t; #else typedef struct { void *Ptr; } x264_pthread_cond_t; #endif #define x264_pthread_condattr_t int int x264_pthread_create( x264_pthread_t *thread, const x264_pthread_attr_t *attr, void *(*start_routine)( void* ), void *arg ); int x264_pthread_join( x264_pthread_t thread, void **value_ptr ); int x264_pthread_mutex_init( x264_pthread_mutex_t *mutex, const x264_pthread_mutexattr_t *attr ); int x264_pthread_mutex_destroy( x264_pthread_mutex_t *mutex ); int x264_pthread_mutex_lock( x264_pthread_mutex_t *mutex ); int x264_pthread_mutex_unlock( x264_pthread_mutex_t *mutex ); int x264_pthread_cond_init( x264_pthread_cond_t *cond, const x264_pthread_condattr_t *attr ); int x264_pthread_cond_destroy( x264_pthread_cond_t *cond ); int x264_pthread_cond_broadcast( x264_pthread_cond_t *cond ); int x264_pthread_cond_wait( x264_pthread_cond_t *cond, x264_pthread_mutex_t *mutex ); int x264_pthread_cond_signal( x264_pthread_cond_t *cond ); #define x264_pthread_attr_init(a) 0 #define x264_pthread_attr_destroy(a) 0 int x264_win32_threading_init( void ); void x264_win32_threading_destroy( void ); int x264_pthread_num_processors_np( void ); #endif x264-master/common/x86/000077500000000000000000000000001502133446700147745ustar00rootroot00000000000000x264-master/common/x86/bitstream-a.asm000066400000000000000000000073611502133446700177150ustar00rootroot00000000000000;***************************************************************************** ;* bitstream-a.asm: x86 bitstream functions ;***************************************************************************** ;* Copyright (C) 2010-2025 x264 project ;* ;* Authors: Fiona Glaser ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. 
;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION .text ;----------------------------------------------------------------------------- ; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end ) ;----------------------------------------------------------------------------- %macro NAL_LOOP 2 %%escape: ; Detect false positive to avoid unnecessary escape loop xor r3d, r3d cmp byte [r0+r1-1], 0 setnz r3b xor k3, k4 jnz .escape jmp %%continue ALIGN 16 %1: mova [r0+r1+mmsize], m1 pcmpeqb m1, m0 mova [r0+r1], m2 pcmpeqb m2, m0 pmovmskb r3d, m1 %2 m1, [r1+r2+3*mmsize] pmovmskb r4d, m2 %2 m2, [r1+r2+2*mmsize] shl k3, mmsize or k3, k4 lea k4, [2*r3+1] and k4, k3 jnz %%escape %%continue: add r1, 2*mmsize jl %1 %endmacro %macro NAL_ESCAPE 0 %if mmsize == 32 %xdefine k3 r3 %xdefine k4 r4 %else %xdefine k3 r3d %xdefine k4 r4d %endif cglobal nal_escape, 3,5 movzx r3d, byte [r1] sub r1, r2 ; r1 = offset of current src pointer from end of src pxor m0, m0 mov [r0], r3b sub r0, r1 ; r0 = projected end of dst, assuming no more escapes or r3d, 0xffffff00 ; ignore data before src ; Start off by jumping into the escape loop in case there's an escape at the start. ; And do a few more in scalar until dst is aligned. jmp .escape_loop %if mmsize == 16 NAL_LOOP .loop_aligned, mova jmp .ret %endif NAL_LOOP .loop_unaligned, movu .ret: movifnidn rax, r0 RET .escape: ; Skip bytes that are known to be valid and k4, k3 tzcnt k4, k4 xor r3d, r3d ; the last two bytes are known to be zero add r1, r4 .escape_loop: inc r1 jge .ret movzx r4d, byte [r1+r2] shl r3d, 8 or r3d, r4d test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3 jz .add_escape_byte .escaped: lea r4d, [r0+r1] mov [r0+r1], r3b test r4d, mmsize-1 ; Do SIMD when dst is aligned jnz .escape_loop movu m1, [r1+r2+mmsize] movu m2, [r1+r2] %if mmsize == 16 lea r4d, [r1+r2] test r4d, mmsize-1 jz .loop_aligned %endif jmp .loop_unaligned .add_escape_byte: mov byte [r0+r1], 3 inc r0 or r3d, 0x0300 jmp .escaped %endmacro INIT_MMX mmx2 NAL_ESCAPE INIT_XMM sse2 NAL_ESCAPE %if ARCH_X86_64 INIT_YMM avx2 NAL_ESCAPE %endif x264-master/common/x86/bitstream.h000066400000000000000000000116431502133446700171440ustar00rootroot00000000000000/***************************************************************************** * bitstream.h: x86 bitstream functions ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_X86_BITSTREAM_H #define X264_X86_BITSTREAM_H #define x264_nal_escape_mmx2 x264_template(nal_escape_mmx2) uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end ); #define x264_nal_escape_sse2 x264_template(nal_escape_sse2) uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); #define x264_nal_escape_avx2 x264_template(nal_escape_avx2) uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end ); #define x264_cabac_block_residual_rd_internal_sse2 x264_template(cabac_block_residual_rd_internal_sse2) void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_rd_internal_lzcnt x264_template(cabac_block_residual_rd_internal_lzcnt) void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_rd_internal_ssse3 x264_template(cabac_block_residual_rd_internal_ssse3) void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_rd_internal_ssse3_lzcnt x264_template(cabac_block_residual_rd_internal_ssse3_lzcnt) void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_rd_internal_avx512 x264_template(cabac_block_residual_rd_internal_avx512) void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_8x8_rd_internal_sse2 x264_template(cabac_block_residual_8x8_rd_internal_sse2) void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_8x8_rd_internal_lzcnt x264_template(cabac_block_residual_8x8_rd_internal_lzcnt) void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_8x8_rd_internal_ssse3 x264_template(cabac_block_residual_8x8_rd_internal_ssse3) void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt x264_template(cabac_block_residual_8x8_rd_internal_ssse3_lzcnt) void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_8x8_rd_internal_avx512 x264_template(cabac_block_residual_8x8_rd_internal_avx512) void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_internal_sse2 x264_template(cabac_block_residual_internal_sse2) void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_internal_lzcnt 
x264_template(cabac_block_residual_internal_lzcnt) void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_internal_avx2 x264_template(cabac_block_residual_internal_avx2) void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #define x264_cabac_block_residual_internal_avx512 x264_template(cabac_block_residual_internal_avx512) void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); #endif x264-master/common/x86/cabac-a.asm000066400000000000000000000501651502133446700167540ustar00rootroot00000000000000;***************************************************************************** ;* cabac-a.asm: x86 cabac ;***************************************************************************** ;* Copyright (C) 2008-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Holger Lubitz ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 64 %if ARCH_X86_64 %macro COEFF_LAST_TABLE 4-18 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 %xdefine %%funccpu1 %2 ; last4 %xdefine %%funccpu2 %3 ; last64 %xdefine %%funccpu3 %4 ; last15/last16 coeff_last_%1: %xdefine %%base coeff_last_%1 %rep 14 %ifidn %5, 4 dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base %elifidn %5, 64 dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu2) - %%base %else dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu3) - %%base %endif %rotate 1 %endrep dd 0, 0 ; 64-byte alignment padding %endmacro cextern coeff_last4_mmx2 cextern coeff_last4_lzcnt %if HIGH_BIT_DEPTH cextern coeff_last4_avx512 %endif cextern coeff_last15_sse2 cextern coeff_last15_lzcnt cextern coeff_last15_avx512 cextern coeff_last16_sse2 cextern coeff_last16_lzcnt cextern coeff_last16_avx512 cextern coeff_last64_sse2 cextern coeff_last64_lzcnt cextern coeff_last64_avx2 cextern coeff_last64_avx512 COEFF_LAST_TABLE sse2, mmx2, sse2, sse2 COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, lzcnt COEFF_LAST_TABLE avx2, lzcnt, avx2, lzcnt %if HIGH_BIT_DEPTH COEFF_LAST_TABLE avx512, avx512, avx512, avx512 %else COEFF_LAST_TABLE avx512, lzcnt, avx512, avx512 %endif %endif coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0 coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9 coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 db 4, 4, 4, 4, 5, 6, 7, 7 SECTION .text cextern_common cabac_range_lps cextern_common cabac_transition cextern_common cabac_renorm_shift cextern_common cabac_entropy cextern cabac_size_unary cextern cabac_transition_unary cextern_common significant_coeff_flag_offset cextern_common significant_coeff_flag_offset_8x8 cextern_common last_coeff_flag_offset cextern_common last_coeff_flag_offset_8x8 cextern_common coeff_abs_level_m1_offset cextern_common count_cat_m1 cextern cabac_encode_ue_bypass %if ARCH_X86_64 %define pointer resq %else %define pointer resd %endif struc cb .low: resd 1 .range: resd 1 .queue: resd 1 .bytes_outstanding: resd 1 .start: pointer 1 .p: pointer 1 .end: pointer 1 align 64, resb 1 .bits_encoded: resd 1 .state: resb 1024 endstruc %macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp %if ARCH_X86_64 == 0 movzx %1, byte [%2+%3+%4] %elifidn %4, 0 movzx %1, byte [%2+%3+r7-$$] %else lea %5, [r7+%4] movzx %1, byte [%2+%3+%5-$$] %endif %endmacro %macro CABAC 1 ; t3 must be ecx, since it's used for shift. 
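; The decision encoder below is the core arithmetic-coding step: look up the
; LPS sub-range for this context, shrink the range, and either keep low (MPS)
; or advance low and take the LPS sub-range (LPS), then step the context state
; and renormalize. A rough C sketch of the same steps, using the cb struct
; fields above and assuming the state byte packs pStateIdx<<1 | valMPS
; (illustrative only, derived from the code below):
;
;   int state     = cb->state[ctx];
;   int range_lps = cabac_range_lps[(state & ~1)*2 + (cb->range>>6) - 4];
;   cb->range    -= range_lps;
;   if( b != (state & 1) )        /* coded bin is the LPS */
;   {
;       cb->low  += cb->range;
;       cb->range = range_lps;
;   }
;   cb->state[ctx] = cabac_transition[state*2 + b];
;   /* renormalize: shift low/range left until range >= 0x100, flushing
;    * completed bytes through cabac_putbyte once enough bits are queued */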
%if WIN64 DECLARE_REG_TMP 3,1,2,0,5,6,4,4 %elif ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,6,6 %else DECLARE_REG_TMP 0,4,2,1,3,5,6,2 %endif cglobal cabac_encode_decision_%1, 1,7 movifnidn t1d, r1m mov t5d, [r0+cb.range] movzx t6d, byte [r0+cb.state+t1] movifnidn t0, r0 ; WIN64 mov t4d, ~1 mov t3d, t5d and t4d, t6d shr t5d, 6 movifnidn t2d, r2m %if WIN64 PUSH r7 %endif %if ARCH_X86_64 lea r7, [$$] %endif LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4 and t6d, 1 sub t3d, t5d cmp t6d, t2d mov t6d, [t0+cb.low] lea t2, [t6+t3] cmovne t3d, t5d cmovne t6d, t2d mov [t0+cb.state+t1], t4b ;cabac_encode_renorm mov t4d, t3d %ifidn %1, bmi2 lzcnt t3d, t3d sub t3d, 23 shlx t4d, t4d, t3d shlx t6d, t6d, t3d %else shr t3d, 3 LOAD_GLOBAL t3d, cabac_renorm_shift, t3 shl t4d, t3b shl t6d, t3b %endif %if WIN64 POP r7 %endif mov [t0+cb.range], t4d add t3d, [t0+cb.queue] jge cabac_putbyte_%1 .update_queue_low: mov [t0+cb.low], t6d mov [t0+cb.queue], t3d RET cglobal cabac_encode_bypass_%1, 2,3 mov t7d, [r0+cb.low] and r1d, [r0+cb.range] lea t7d, [t7*2+r1] movifnidn t0, r0 ; WIN64 mov t3d, [r0+cb.queue] inc t3d %if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp jge cabac_putbyte_%1 %else jge .putbyte %endif mov [t0+cb.low], t7d mov [t0+cb.queue], t3d RET %if ARCH_X86_64 == 0 .putbyte: PROLOGUE 0,7 movifnidn t6d, t7d jmp cabac_putbyte_%1 %endif %ifnidn %1,bmi2 cglobal cabac_encode_terminal_%1, 1,3 sub dword [r0+cb.range], 2 ; shortcut: the renormalization shift in terminal ; can only be 0 or 1 and is zero over 99% of the time. test dword [r0+cb.range], 0x100 je .renorm RET .renorm: shl dword [r0+cb.low], 1 shl dword [r0+cb.range], 1 inc dword [r0+cb.queue] jge .putbyte RET .putbyte: PROLOGUE 0,7 movifnidn t0, r0 ; WIN64 mov t3d, [r0+cb.queue] mov t6d, [t0+cb.low] %endif cabac_putbyte_%1: ; alive: t0=cb t3=queue t6=low %if WIN64 DECLARE_REG_TMP 3,6,1,0,2,5,4 %endif %ifidn %1, bmi2 add t3d, 10 shrx t2d, t6d, t3d bzhi t6d, t6d, t3d sub t3d, 18 %else mov t1d, -1 add t3d, 10 mov t2d, t6d shl t1d, t3b shr t2d, t3b ; out not t1d sub t3d, 18 and t6d, t1d %endif mov t5d, [t0+cb.bytes_outstanding] cmp t2b, 0xff ; FIXME is a 32bit op faster? jz .postpone mov t1, [t0+cb.p] add [t1-1], t2h dec t2h .loop_outstanding: mov [t1], t2h inc t1 dec t5d jge .loop_outstanding mov [t1-1], t2b mov [t0+cb.p], t1 .postpone: inc t5d mov [t0+cb.bytes_outstanding], t5d jmp mangle(private_prefix %+ _cabac_encode_decision_%1.update_queue_low) %endmacro CABAC asm CABAC bmi2 %if ARCH_X86_64 ; %1 = label name ; %2 = node_ctx init? 
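; COEFF_ABS_LEVEL_GT1 is the RD-cost flavour of coefficient-level coding: it
; does not emit bits, it accumulates an approximate cost in r0
; (cabac.f8_bits_encoded) in 1/256-bit units via the cabac_entropy and
; cabac_size_unary tables. For one coefficient it adds the cost of the
; "abs > 1" decision; if abs > 1 it also adds the truncated-unary prefix cost
; in the greater-than-1 context and, for abs >= 15, the Exp-Golomb bypass
; suffix cost (2*ilog2(abs-14)+1) << 8. Both paths update the node context in
; r2 that selects the contexts for the next coefficient.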
%macro COEFF_ABS_LEVEL_GT1 2 %if %2 %define ctx 1 %else movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL] %define ctx r11 %endif movzx r9d, byte [r8+ctx] ; if( coeff_abs > 1 ) cmp r1d, 1 jg .%1_gt1 ; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 ) movzx r10d, byte [cabac_transition+r9*2 GLOBAL] movzx r9d, word [cabac_entropy+r9*2 GLOBAL] lea r0d, [r0+r9+256] mov [r8+ctx], r10b %if %2 mov r2d, 1 %else movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL] %endif jmp .%1_end .%1_gt1: ; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 ) movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL] xor r9d, 1 movzx r9d, word [cabac_entropy+r9*2 GLOBAL] mov [r8+ctx], r10b add r0d, r9d %if %2 %define ctx 5 %else movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL] %define ctx r11 %endif ; if( coeff_abs < 15 ) cmp r1d, 15 jge .%1_escape shl r1d, 7 ; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]] movzx r9d, byte [r8+ctx] add r9d, r1d movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL] ; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]] movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL] mov [r8+ctx], r10b add r0d, r9d jmp .%1_gt1_end .%1_escape: ; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]] movzx r9d, byte [r8+ctx] movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL] ; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]] movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL] add r0d, r9d mov [r8+ctx], r10b sub r1d, 14 %if cpuflag(lzcnt) lzcnt r9d, r1d xor r9d, 0x1f %else bsr r9d, r1d %endif ; bs_size_ue_big(coeff_abs-15)<<8 shl r9d, 9 ; (ilog2(coeff_abs-14)+1) << 8 lea r0d, [r0+r9+256] .%1_gt1_end: %if %2 mov r2d, 4 %else movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL] %endif .%1_end: %endmacro %macro LOAD_DCTCOEF 1 %if HIGH_BIT_DEPTH mov %1, [dct+r6*4] %else movzx %1, word [dct+r6*2] %endif %endmacro %macro ABS_DCTCOEFS 2 %if HIGH_BIT_DEPTH %define %%abs ABSD %else %define %%abs ABSW %endif %if mmsize == %2*SIZEOF_DCTCOEF %%abs m0, [%1], m1 mova [rsp], m0 %elif mmsize == %2*SIZEOF_DCTCOEF/2 %%abs m0, [%1+0*mmsize], m2 %%abs m1, [%1+1*mmsize], m3 mova [rsp+0*mmsize], m0 mova [rsp+1*mmsize], m1 %else %assign i 0 %rep %2*SIZEOF_DCTCOEF/(4*mmsize) %%abs m0, [%1+(4*i+0)*mmsize], m4 %%abs m1, [%1+(4*i+1)*mmsize], m5 %%abs m2, [%1+(4*i+2)*mmsize], m4 %%abs m3, [%1+(4*i+3)*mmsize], m5 mova [rsp+(4*i+0)*mmsize], m0 mova [rsp+(4*i+1)*mmsize], m1 mova [rsp+(4*i+2)*mmsize], m2 mova [rsp+(4*i+3)*mmsize], m3 %assign i i+1 %endrep %endif %endmacro %macro SIG_OFFSET 1 %if %1 movzx r11d, byte [r4+r6] %endif %endmacro %macro LAST_OFFSET 1 %if %1 movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL] %endif %endmacro %macro COEFF_LAST 2 ; table, ctx_block_cat lea r1, [%1 GLOBAL] movsxd r6, [r1+4*%2] add r6, r1 call r6 %endmacro ;----------------------------------------------------------------------------- ; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, ; int ctx_block_cat, x264_cabac_t *cb ); ;----------------------------------------------------------------------------- ;%1 = 8x8 mode %macro CABAC_RESIDUAL_RD 2 %if %1 %define func cabac_block_residual_8x8_rd_internal %define maxcoeffs 64 %define dct rsp %else %define func cabac_block_residual_rd_internal %define maxcoeffs 16 %define dct r4 %endif cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF lea r12, [$$] %define GLOBAL +r12-$$ shl r1d, 4 ; MB_INTERLACED*16 %if %1 lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8 %endif add r1d, r2d movzx r5d, word 
[significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level ; abs() all the coefficients; copy them to the stack to avoid ; changing the originals. ; overreading is okay; it's all valid aligned data anyways. %if %1 ABS_DCTCOEFS r0, 64 %else mov r4, r0 ; r4 = dct and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case ABS_DCTCOEFS r4, 16 xor r4, r0 ; calculate our new dct pointer add r4, rsp ; restore AC coefficient offset %endif ; for improved OOE performance, run coeff_last on the original coefficients. COEFF_LAST %2, r2 ; coeff_last[ctx_block_cat]( dct ) ; we know on 64-bit that the SSE2 versions of this function only ; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we ; don't need r2 in 8x8 mode. mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded ; pre-add some values to simplify addressing add r3, cb.state add r5, r3 add r7, r3 add r8, r3 ; precalculate cabac state pointers ; if( last != count_cat_m1[ctx_block_cat] ) %if %1 cmp r6b, 63 %else cmp r6b, [count_cat_m1+r2 GLOBAL] %endif je .skip_last_sigmap ; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last, ; so we'll use r11 for this. %if %1 %define siglast_ctx r11 %else %define siglast_ctx r6 %endif ; x264_cabac_encode_decision( cb, ctx_sig + last, 1 ) ; x264_cabac_encode_decision( cb, ctx_last + last, 1 ) SIG_OFFSET %1 movzx r1d, byte [r5+siglast_ctx] movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL] xor r1d, 1 movzx r1d, word [cabac_entropy+r1*2 GLOBAL] mov [r5+siglast_ctx], r9b add r0d, r1d LAST_OFFSET %1 movzx r1d, byte [r7+siglast_ctx] movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL] xor r1d, 1 movzx r1d, word [cabac_entropy+r1*2 GLOBAL] mov [r7+siglast_ctx], r9b add r0d, r1d .skip_last_sigmap: LOAD_DCTCOEF r1d COEFF_ABS_LEVEL_GT1 last, 1 ; for( int i = last-1 ; i >= 0; i-- ) dec r6d jl .end .coeff_loop: LOAD_DCTCOEF r1d ; if( l[i] ) SIG_OFFSET %1 movzx r9d, byte [r5+siglast_ctx] test r1d, r1d jnz .coeff_nonzero ; x264_cabac_encode_decision( cb, ctx_sig + i, 0 ) movzx r10d, byte [cabac_transition+r9*2 GLOBAL] movzx r9d, word [cabac_entropy+r9*2 GLOBAL] mov [r5+siglast_ctx], r10b add r0d, r9d dec r6d jge .coeff_loop jmp .end .coeff_nonzero: ; x264_cabac_encode_decision( cb, ctx_sig + i, 1 ) movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL] xor r9d, 1 movzx r9d, word [cabac_entropy+r9*2 GLOBAL] mov [r5+siglast_ctx], r10b add r0d, r9d ; x264_cabac_encode_decision( cb, ctx_last + i, 0 ); LAST_OFFSET %1 movzx r9d, byte [r7+siglast_ctx] movzx r10d, byte [cabac_transition+r9*2 GLOBAL] movzx r9d, word [cabac_entropy+r9*2 GLOBAL] mov [r7+siglast_ctx], r10b add r0d, r9d COEFF_ABS_LEVEL_GT1 coeff, 0 dec r6d jge .coeff_loop .end: mov [r3+cb.bits_encoded-cb.state], r0d RET %endmacro INIT_XMM sse2 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 INIT_XMM lzcnt CABAC_RESIDUAL_RD 0, coeff_last_lzcnt CABAC_RESIDUAL_RD 1, coeff_last_lzcnt INIT_XMM ssse3 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 INIT_XMM ssse3,lzcnt CABAC_RESIDUAL_RD 0, coeff_last_lzcnt CABAC_RESIDUAL_RD 1, coeff_last_lzcnt %if HIGH_BIT_DEPTH INIT_ZMM avx512 %else INIT_YMM avx512 %endif CABAC_RESIDUAL_RD 0, coeff_last_avx512 INIT_ZMM avx512 CABAC_RESIDUAL_RD 1, coeff_last_avx512 ;----------------------------------------------------------------------------- ; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, ; int 
ctx_block_cat, x264_cabac_t *cb ); ;----------------------------------------------------------------------------- %macro CALL_CABAC 0 %if cpuflag(bmi2) call cabac_encode_decision_bmi2 %else call cabac_encode_decision_asm %endif %if WIN64 ; move cabac back mov r0, r3 %endif %endmacro ; %1 = 8x8 mode ; %2 = dct register ; %3 = countcat ; %4 = name %macro SIGMAP_LOOP 3-4 .sigmap_%4loop: %if HIGH_BIT_DEPTH mov %2, [dct+r10*4] %else movsx %2, word [dct+r10*2] %endif %if %1 movzx r1d, byte [sigoff_8x8 + r10] add r1d, sigoffd %else lea r1d, [sigoffd + r10d] %endif test %2, %2 jz .sigmap_%4zero ; if( l[i] ) inc coeffidxd mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]; mov r2d, 1 CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 ); %if %1 movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL] add r1d, lastoffd %else lea r1d, [lastoffd + r10d] %endif cmp r10d, lastm ; if( i == last ) je .sigmap_%4last xor r2d, r2d CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 ); jmp .sigmap_%4loop_endcheck .sigmap_%4zero: xor r2d, r2d CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 ); .sigmap_%4loop_endcheck: inc r10d cmp r10d, %3 jne .sigmap_%4loop ; if( ++i == count_m1 ) %if HIGH_BIT_DEPTH mov %2, [dct+r10*4] %else movsx %2, word [dct+r10*2] %endif inc coeffidxd mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i] jmp .sigmap_%4end .sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 ); mov r2d, 1 CALL_CABAC .sigmap_%4end: %if %1==0 jmp .level_loop_start %endif %endmacro %macro CABAC_RESIDUAL 1 cglobal cabac_block_residual_internal, 4,15,0,-4*64 ; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register. lea r7, [$$] %define lastm [rsp+4*1] %define GLOBAL +r7-$$ shl r1d, 4 %define sigoffq r8 %define sigoffd r8d %define lastoffq r9 %define lastoffd r9d %define leveloffq r10 %define leveloffd r10d %define leveloffm [rsp+4*0] %define countcatd r11d %define sigoff_8x8 r12 %define coeffidxq r13 %define coeffidxd r13d %define dct r14 %define coeffs rsp+4*2 lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] add r1d, r2d movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL] movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL] movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] movzx countcatd, byte [count_cat_m1+r2 GLOBAL] mov coeffidxd, -1 mov dct, r0 mov leveloffm, leveloffd COEFF_LAST %1, r2 mov lastm, eax ; put cabac in r0; needed for cabac_encode_decision mov r0, r3 xor r10d, r10d cmp countcatd, 63 je .sigmap_8x8 SIGMAP_LOOP 0, r12d, countcatd .sigmap_8x8: SIGMAP_LOOP 1, r11d, 63, _8x8 .level_loop_start: ; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop. 
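; The level loop pops the nonzero coefficients that the significance-map pass
; pushed onto the stack buffer (reverse scan order) and codes magnitude and
; sign for each. Roughly, in C (an illustrative sketch mirroring the labels
; below, not the exact library source):
;
;   for( int i = coeff_idx; i >= 0; i-- )
;   {
;       int coeff = coeffs[i];
;       int abs_coeff = abs( coeff );
;       int ctx = ctx_level + coeff_abs_level1_ctx[node_ctx];
;       if( abs_coeff > 1 )
;       {
;           cabac_encode_decision( cb, ctx, 1 );
;           ctx = ctx_level + coeff_abs_levelgt1_ctx[node_ctx];
;           for( int j = X264_MIN( abs_coeff, 15 ) - 2; j > 0; j-- )
;               cabac_encode_decision( cb, ctx, 1 );        /* truncated unary */
;           if( abs_coeff < 15 )
;               cabac_encode_decision( cb, ctx, 0 );
;           else
;               cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 ); /* EG0 suffix */
;           node_ctx = coeff_abs_level_transition[1][node_ctx];
;       }
;       else
;       {
;           cabac_encode_decision( cb, ctx, 0 );
;           node_ctx = coeff_abs_level_transition[0][node_ctx];
;       }
;       cabac_encode_bypass( cb, coeff < 0 );                /* sign bit */
;   }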
%define nodectxq r8 %define nodectxd r8d mov leveloffd, leveloffm xor nodectxd, nodectxd .level_loop: mov r9d, [coeffs+coeffidxq*4] mov r11d, r9d sar r11d, 31 add r9d, r11d movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL] xor r9d, r11d add r1d, leveloffd cmp r9d, 1 jg .level_gt1 xor r2d, r2d CALL_CABAC movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL] jmp .level_sign .level_gt1: mov r2d, 1 CALL_CABAC movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL] add r14d, leveloffd cmp r9d, 15 mov r12d, 15 cmovl r12d, r9d sub r12d, 2 jz .level_eq2 .level_gt1_loop: mov r1d, r14d mov r2d, 1 CALL_CABAC dec r12d jg .level_gt1_loop cmp r9d, 15 jge .level_bypass .level_eq2: mov r1d, r14d xor r2d, r2d CALL_CABAC jmp .level_gt1_end .level_bypass: lea r2d, [r9d-15] xor r1d, r1d push r0 ; we could avoid this if we implemented it in asm, but I don't feel like that ; right now. %if UNIX64 push r7 push r8 %else sub rsp, 40 ; shadow space and alignment %endif call cabac_encode_ue_bypass %if UNIX64 pop r8 pop r7 %else add rsp, 40 %endif pop r0 .level_gt1_end: movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL] .level_sign: mov r1d, r11d %if cpuflag(bmi2) call cabac_encode_bypass_bmi2 %else call cabac_encode_bypass_asm %endif %if WIN64 mov r0, r3 %endif dec coeffidxd jge .level_loop RET %endmacro INIT_XMM sse2 CABAC_RESIDUAL coeff_last_sse2 INIT_XMM lzcnt CABAC_RESIDUAL coeff_last_lzcnt INIT_XMM avx2 CABAC_RESIDUAL coeff_last_avx2 INIT_XMM avx512 CABAC_RESIDUAL coeff_last_avx512 %endif x264-master/common/x86/const-a.asm000066400000000000000000000056621502133446700170530ustar00rootroot00000000000000;***************************************************************************** ;* const-a.asm: x86 global constants ;***************************************************************************** ;* Copyright (C) 2010-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" SECTION_RODATA 32 const pb_1, times 32 db 1 const hsub_mul, times 16 db 1, -1 const pw_1, times 16 dw 1 const pw_16, times 16 dw 16 const pw_32, times 16 dw 32 const pw_512, times 16 dw 512 const pw_00ff, times 16 dw 0x00ff const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 const pd_1, times 8 dd 1 const pd_0123, dd 0,1,2,3 const pd_4567, dd 4,5,6,7 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 const pb_01, times 8 db 0,1 const pb_0, times 16 db 0 const pb_a1, times 16 db 0xa1 const pb_3, times 16 db 3 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 const pw_2, times 8 dw 2 const pw_m2, times 8 dw -2 const pw_4, times 8 dw 4 const pw_8, times 8 dw 8 const pw_64, times 8 dw 64 const pw_256, times 8 dw 256 const pw_32_0, times 4 dw 32 times 4 dw 0 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 const pd_8, times 4 dd 8 const pd_32, times 4 dd 32 const pd_1024, times 4 dd 1024 const pd_ffff, times 4 dd 0xffff const pw_ff00, times 8 dw 0xff00 const popcnt_table %assign x 0 %rep 256 ; population count db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1) %assign x x+1 %endrep const sw_64, dd 64 x264-master/common/x86/cpu-a.asm000066400000000000000000000061531502133446700165100ustar00rootroot00000000000000;***************************************************************************** ;* cpu-a.asm: x86 cpu utilities ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Laurent Aimar ;* Loren Merritt ;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" SECTION .text ;----------------------------------------------------------------------------- ; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx ) ;----------------------------------------------------------------------------- cglobal cpu_cpuid, 5,7 push rbx push r4 push r3 push r2 push r1 mov eax, r0d xor ecx, ecx cpuid pop r4 mov [r4], eax pop r4 mov [r4], ebx pop r4 mov [r4], ecx pop r4 mov [r4], edx pop rbx RET ;----------------------------------------------------------------------------- ; uint64_t cpu_xgetbv( int xcr ) ;----------------------------------------------------------------------------- cglobal cpu_xgetbv movifnidn ecx, r0m xgetbv %if ARCH_X86_64 shl rdx, 32 or rax, rdx %endif ret ;----------------------------------------------------------------------------- ; void cpu_emms( void ) ;----------------------------------------------------------------------------- cglobal cpu_emms emms ret ;----------------------------------------------------------------------------- ; void cpu_sfence( void ) ;----------------------------------------------------------------------------- cglobal cpu_sfence sfence ret %if ARCH_X86_64 == 0 ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) ; return 0 if unsupported ;----------------------------------------------------------------------------- cglobal cpu_cpuid_test pushfd push ebx push ebp push esi push edi pushfd pop eax mov ebx, eax xor eax, 0x200000 push eax popfd pushfd pop eax xor eax, ebx pop edi pop esi pop ebp pop ebx popfd ret %endif x264-master/common/x86/dct-32.asm000066400000000000000000000404701502133446700164770ustar00rootroot00000000000000;***************************************************************************** ;* dct-32.asm: x86_32 transform and zigzag ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz ;* Laurent Aimar ;* Min Chen ;* Christian Heine ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION .text cextern pd_32 cextern pw_pixel_max cextern pw_2 cextern pw_m2 cextern pw_32 cextern hsub_mul %macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets %xdefine %%base %1 %rep %0/2 %xdefine %%tmp m%2 %rotate %0/2 mova [%%base + %2*16], %%tmp %rotate 1-%0/2 %endrep %endmacro %macro UNSPILL_SHUFFLE 3-* %xdefine %%base %1 %rep %0/2 %xdefine %%tmp m%2 %rotate %0/2 mova %%tmp, [%%base + %2*16] %rotate 1-%0/2 %endrep %endmacro %macro SPILL 2+ ; assume offsets are the same as reg numbers SPILL_SHUFFLE %1, %2, %2 %endmacro %macro UNSPILL 2+ UNSPILL_SHUFFLE %1, %2, %2 %endmacro ; in: size, m0..m7 ; out: 0,4,6 in memory at %10,%11,%12, rest in regs %macro DCT8_1D 12 SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07 SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16 SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25 SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34 SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2 SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3 SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4 mova %10, m%7 mova %11, m%6 psra%1 m%7, m%8, 1 ; a3>>1 padd%1 m%7, m%9 ; a2 + (a3>>1) psra%1 m%9, 1 ; a2>>1 psub%1 m%9, m%8 ; (a2>>1) - a3 mova %12, m%9 psra%1 m%6, m%4, 1 padd%1 m%6, m%4 ; d25+(d25>>1) psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1)) psub%1 m%8, m%6 psra%1 m%6, m%3, 1 padd%1 m%6, m%3 ; d16+(d16>>1) padd%1 m%9, m%2, m%5 psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1)) psra%1 m%6, m%2, 1 padd%1 m%6, m%2 ; d07+(d07>>1) padd%1 m%6, m%3 padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1)) psra%1 m%2, m%5, 1 padd%1 m%2, m%5 ; d34+(d34>>1) padd%1 m%2, m%3 psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1)) psra%1 m%5, m%2, 2 padd%1 m%5, m%6 ; a4 + (a7>>2) psra%1 m%4, m%9, 2 padd%1 m%4, m%8 ; a5 + (a6>>2) psra%1 m%6, 2 psra%1 m%8, 2 psub%1 m%6, m%2 ; (a4>>2) - a7 psub%1 m%9, m%8 ; a6 - (a5>>2) SWAP %3, %5, %4, %7, %9, %6 %endmacro ; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11 ; out: m0..m7 %macro IDCT8_1D 11 psra%1 m%2, m%4, 1 psra%1 m%6, m%8, 1 psub%1 m%2, m%8 padd%1 m%6, m%4 psra%1 m%8, m%3, 1 padd%1 m%8, m%3 padd%1 m%8, m%5 padd%1 m%8, m%7 psra%1 m%4, m%7, 1 padd%1 m%4, m%7 padd%1 m%4, m%9 psub%1 m%4, m%3 psub%1 m%3, m%5 psub%1 m%7, m%5 padd%1 m%3, m%9 psub%1 m%7, m%9 psra%1 m%5, 1 psra%1 m%9, 1 psub%1 m%3, m%5 psub%1 m%7, m%9 psra%1 m%5, m%8, 2 psra%1 m%9, m%4, 2 padd%1 m%5, m%7 padd%1 m%9, m%3 psra%1 m%7, 2 psra%1 m%3, 2 psub%1 m%8, m%7 psub%1 m%3, m%4 mova m%4, %10 mova m%7, %11 SUMSUB_BA %1, %7, %4 SUMSUB_BA %1, %6, %7 SUMSUB_BA %1, %2, %4 SUMSUB_BA %1, %8, %6 SUMSUB_BA %1, %3, %2 SUMSUB_BA %1, %9, %4 SUMSUB_BA %1, %5, %7 SWAP %2, %4 SWAP %6, %8 SWAP %2, %6, %7 SWAP %4, %9, %8 %endmacro %if HIGH_BIT_DEPTH %macro SUB8x8_DCT8 0 cglobal sub8x8_dct8, 3,3,8 cglobal_label .skip_prologue LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2 LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2 DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50] mova m0, [r0] mova [r0+0x30], m5 mova [r0+0x70], m7 TRANSPOSE4x4W 0,1,2,3,4 WIDEN_SXWD 0,4 WIDEN_SXWD 1,5 WIDEN_SXWD 2,6 WIDEN_SXWD 3,7 DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0] mova [r0+0x20], m4 mova [r0+0x40], m1 mova [r0+0x60], m5 mova [r0+0xA0], m6 mova [r0+0xE0], m7 mova m4, [r0+0x10] mova m5, [r0+0x30] mova m6, [r0+0x50] mova m7, [r0+0x70] TRANSPOSE4x4W 4,5,6,7,0 WIDEN_SXWD 4,0 WIDEN_SXWD 5,1 WIDEN_SXWD 6,2 WIDEN_SXWD 7,3 DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0] mova [r0+0x30], m0 mova [r0+0x50], m5 mova [r0+0x70], m1 mova [r0+0xB0], m2 mova [r0+0xF0], m3 ret 
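; sub8x8_dct8 (high bit depth): the residual (fenc - fdec) goes through two
; DCT8_1D passes separated by a transpose; the second pass is widened to
; 32-bit dwords (WIDEN_SXWD) and split into two four-column halves because
; only eight XMM registers are available on x86_32. Each DCT8_1D pass is the
; H.264 8x8 forward-transform butterfly spelled out in its register comments;
; a rough scalar C sketch of one 8-point pass (illustrative only):
;
;   s07 = x0+x7;  s16 = x1+x6;  s25 = x2+x5;  s34 = x3+x4;
;   d07 = x0-x7;  d16 = x1-x6;  d25 = x2-x5;  d34 = x3-x4;
;   a0 = s07+s34;  a1 = s16+s25;  a2 = s07-s34;  a3 = s16-s25;
;   a4 = d16+d25+(d07+(d07>>1));   a5 = d07-d34-(d25+(d25>>1));
;   a6 = d07+d34-(d16+(d16>>1));   a7 = d16-d25+(d34+(d34>>1));
;   y0 = a0+a1;       y4 = a0-a1;
;   y2 = a2+(a3>>1);  y6 = (a2>>1)-a3;
;   y1 = a4+(a7>>2);  y3 = a5+(a6>>2);
;   y5 = a6-(a5>>2);  y7 = (a4>>2)-a7;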
%endmacro ; SUB8x8_DCT8 INIT_XMM sse2 SUB8x8_DCT8 INIT_XMM sse4 SUB8x8_DCT8 INIT_XMM avx SUB8x8_DCT8 %macro ADD8x8_IDCT8 0 cglobal add8x8_idct8, 2,2 add r1, 128 cglobal_label .skip_prologue UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6 IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0] mova [r1+0], m4 TRANSPOSE4x4D 0,1,2,3,4 paddd m0, [pd_32] mova m4, [r1+0] SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2 TRANSPOSE4x4D 4,5,6,7,3 paddd m4, [pd_32] SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6 UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7 IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16] mova [r1+16], m4 TRANSPOSE4x4D 0,1,2,3,4 mova m4, [r1+16] mova [r1-112], m0 TRANSPOSE4x4D 4,5,6,7,0 SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7 UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2 IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112] SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1 UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7 IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16] SPILL_SHUFFLE r1, 7,6,5, 7,6,5 mova m7, [pw_pixel_max] pxor m6, m6 mova m5, [r1-128] STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB] mova m0, [r1-112] STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB] mova m0, [r1-96] STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB] mova m0, [r1-80] STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB] mova m0, [r1-64] STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB] mova m0, [r1-48] mova m1, [r1+80] STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB] mova m0, [r1-32] mova m1, [r1+96] STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB] mova m0, [r1-16] mova m1, [r1+112] STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB] RET %endmacro ; ADD8x8_IDCT8 INIT_XMM sse2 ADD8x8_IDCT8 INIT_XMM avx ADD8x8_IDCT8 %else ; !HIGH_BIT_DEPTH INIT_MMX ALIGN 16 load_diff_4x8_mmx: LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE] LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE] LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE] movq [r0], m0 LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE] movq m0, [r0] ret cglobal dct8_mmx DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60] SAVE_MM_PERMUTATION ret ;----------------------------------------------------------------------------- ; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal sub8x8_dct8_mmx, 3,3 global sub8x8_dct8_mmx.skip_prologue .skip_prologue: RESET_MM_PERMUTATION call load_diff_4x8_mmx call dct8_mmx UNSPILL r0, 0 TRANSPOSE4x4W 0,1,2,3,4 SPILL r0, 0,1,2,3 UNSPILL r0, 4,6 TRANSPOSE4x4W 4,5,6,7,0 SPILL r0, 4,5,6,7 RESET_MM_PERMUTATION add r1, 4 add r2, 4 add r0, 8 call load_diff_4x8_mmx sub r1, 4 sub r2, 4 call dct8_mmx sub r0, 8 UNSPILL r0+8, 4,6 TRANSPOSE4x4W 4,5,6,7,0 SPILL r0+8, 4,5,6,7 UNSPILL r0+8, 0 TRANSPOSE4x4W 0,1,2,3,5 UNSPILL r0, 4,5,6,7 SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7 movq mm4, m6 ; depends on the permutation to not produce conflicts movq mm0, m4 movq mm1, m5 movq mm2, mm4 movq mm3, m7 RESET_MM_PERMUTATION UNSPILL r0+8, 4,5,6,7 add r0, 8 call dct8_mmx sub r0, 8 SPILL r0+8, 1,2,3,5,7 RESET_MM_PERMUTATION UNSPILL r0, 0,1,2,3,4,5,6,7 call dct8_mmx SPILL r0, 1,2,3,5,7 ret cglobal idct8_mmx IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64] SAVE_MM_PERMUTATION ret %macro ADD_STORE_ROW 3 movq m1, [r0+%1*FDEC_STRIDE] punpckhbw m2, m1, m0 
punpcklbw m1, m0 paddw m1, %2 paddw m2, %3 packuswb m1, m2 movq [r0+%1*FDEC_STRIDE], m1 %endmacro ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- cglobal add8x8_idct8_mmx, 2,2 global add8x8_idct8_mmx.skip_prologue .skip_prologue: INIT_MMX add word [r1], 32 UNSPILL r1, 1,2,3,5,6,7 call idct8_mmx SPILL r1, 7 TRANSPOSE4x4W 0,1,2,3,7 SPILL r1, 0,1,2,3 UNSPILL r1, 7 TRANSPOSE4x4W 4,5,6,7,0 SPILL r1, 4,5,6,7 INIT_MMX UNSPILL r1+8, 1,2,3,5,6,7 add r1, 8 call idct8_mmx sub r1, 8 SPILL r1+8, 7 TRANSPOSE4x4W 0,1,2,3,7 SPILL r1+8, 0,1,2,3 UNSPILL r1+8, 7 TRANSPOSE4x4W 4,5,6,7,0 SPILL r1+8, 4,5,6,7 INIT_MMX movq m3, [r1+0x08] movq m0, [r1+0x40] movq [r1+0x40], m3 movq [r1+0x08], m0 ; memory layout at this time: ; A0------ A1------ ; B0------ F0------ ; C0------ G0------ ; D0------ H0------ ; E0------ E1------ ; B1------ F1------ ; C1------ G1------ ; D1------ H1------ UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7 UNSPILL r1+8, 5,6,7 add r1, 8 call idct8_mmx sub r1, 8 psraw m0, 6 psraw m1, 6 psraw m2, 6 psraw m3, 6 psraw m4, 6 psraw m5, 6 psraw m6, 6 psraw m7, 6 movq [r1+0x08], m0 ; mm4 movq [r1+0x48], m4 ; mm5 movq [r1+0x58], m5 ; mm0 movq [r1+0x68], m6 ; mm2 movq [r1+0x78], m7 ; mm6 movq mm5, [r1+0x18] movq mm6, [r1+0x28] movq [r1+0x18], m1 ; mm1 movq [r1+0x28], m2 ; mm7 movq mm7, [r1+0x38] movq [r1+0x38], m3 ; mm3 movq mm1, [r1+0x10] movq mm2, [r1+0x20] movq mm3, [r1+0x30] call idct8_mmx psraw m0, 6 psraw m1, 6 psraw m2, 6 psraw m3, 6 psraw m4, 6 psraw m5, 6 psraw m6, 6 psraw m7, 6 SPILL r1, 0,1,2 pxor m0, m0 ADD_STORE_ROW 0, [r1+0x00], [r1+0x08] ADD_STORE_ROW 1, [r1+0x10], [r1+0x18] ADD_STORE_ROW 2, [r1+0x20], [r1+0x28] ADD_STORE_ROW 3, m3, [r1+0x38] ADD_STORE_ROW 4, m4, [r1+0x48] ADD_STORE_ROW 5, m5, [r1+0x58] ADD_STORE_ROW 6, m6, [r1+0x68] ADD_STORE_ROW 7, m7, [r1+0x78] ret %macro DCT_SUB8 0 cglobal sub8x8_dct, 3,3 add r2, 4*FDEC_STRIDE cglobal_label .skip_prologue %if cpuflag(ssse3) mova m7, [hsub_mul] %endif LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE SPILL r0, 1,2 SWAP 2, 7 LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE UNSPILL r0, 1 SPILL r0, 7 SWAP 2, 7 UNSPILL r0, 2 DCT4_1D 0, 1, 2, 3, 7 TRANSPOSE2x4x4W 0, 1, 2, 3, 7 UNSPILL r0, 7 SPILL r0, 2 DCT4_1D 4, 5, 6, 7, 2 TRANSPOSE2x4x4W 4, 5, 6, 7, 2 UNSPILL r0, 2 SPILL r0, 6 DCT4_1D 0, 1, 2, 3, 6 UNSPILL r0, 6 STORE_DCT 0, 1, 2, 3, r0, 0 DCT4_1D 4, 5, 6, 7, 3 STORE_DCT 4, 5, 6, 7, r0, 64 ret ;----------------------------------------------------------------------------- ; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal sub8x8_dct8, 3,3 add r2, 4*FDEC_STRIDE cglobal_label .skip_prologue %if cpuflag(ssse3) mova m7, [hsub_mul] LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE SPILL r0, 0,1 SWAP 1, 7 LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE UNSPILL r0, 0,1 %else LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE] LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE] LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE] LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE] LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE] SPILL r0, 0 LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE] LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE] 
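; m0..m7 now hold the 8x8 residual (fenc - fdec) as eight 16-bit rows (row 0
; was spilled to the dct buffer to free a temp register and is reloaded just
; below); the transform itself is two DCT8_1D passes separated by a transpose,
; with intermediate rows spilled to the output buffer since only eight XMM
; registers are available on x86_32.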
UNSPILL r0, 0 %endif DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60] UNSPILL r0, 0,4 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1 UNSPILL r0, 4 DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60] SPILL r0, 1,2,3,5,7 ret %endmacro INIT_XMM sse2 %define movdqa movaps %define punpcklqdq movlhps DCT_SUB8 %undef movdqa %undef punpcklqdq INIT_XMM ssse3 DCT_SUB8 INIT_XMM avx DCT_SUB8 INIT_XMM xop DCT_SUB8 ;----------------------------------------------------------------------------- ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- %macro ADD8x8 0 cglobal add8x8_idct, 2,2 add r0, 4*FDEC_STRIDE cglobal_label .skip_prologue UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3 SBUTTERFLY qdq, 0, 1, 4 SBUTTERFLY qdq, 2, 3, 4 UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7 SPILL r1, 0 SBUTTERFLY qdq, 4, 5, 0 SBUTTERFLY qdq, 6, 7, 0 UNSPILL r1,0 IDCT4_1D w,0,1,2,3,r1 SPILL r1, 4 TRANSPOSE2x4x4W 0,1,2,3,4 UNSPILL r1, 4 IDCT4_1D w,4,5,6,7,r1 SPILL r1, 0 TRANSPOSE2x4x4W 4,5,6,7,0 UNSPILL r1, 0 paddw m0, [pw_32] IDCT4_1D w,0,1,2,3,r1 paddw m4, [pw_32] IDCT4_1D w,4,5,6,7,r1 SPILL r1, 6,7 pxor m7, m7 DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5 DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5 UNSPILL_SHUFFLE r1, 0,2, 6,7 DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5 DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5 STORE_IDCT m1, m3, m5, m2 ret %endmacro ; ADD8x8 INIT_XMM sse2 ADD8x8 INIT_XMM avx ADD8x8 ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) ;----------------------------------------------------------------------------- %macro ADD8x8_IDCT8 0 cglobal add8x8_idct8, 2,2 add r0, 4*FDEC_STRIDE cglobal_label .skip_prologue UNSPILL r1, 1,2,3,5,6,7 IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64] SPILL r1, 6 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1 paddw m0, [pw_32] SPILL r1, 0 IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64] SPILL r1, 6,7 pxor m7, m7 DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5 DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5 UNSPILL_SHUFFLE r1, 0,2, 6,7 DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5 DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5 STORE_IDCT m1, m3, m5, m2 ret %endmacro ; ADD8x8_IDCT8 INIT_XMM sse2 ADD8x8_IDCT8 INIT_XMM avx ADD8x8_IDCT8 %endif ; !HIGH_BIT_DEPTH x264-master/common/x86/dct-64.asm000066400000000000000000000264141502133446700165060ustar00rootroot00000000000000;***************************************************************************** ;* dct-64.asm: x86_64 transform and zigzag ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz ;* Laurent Aimar ;* Min Chen ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. 
;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION .text cextern pd_32 cextern pw_pixel_max cextern pw_2 cextern pw_m2 cextern pw_32 cextern hsub_mul ; in: size, m0..m7, temp, temp ; out: m0..m7 %macro DCT8_1D 11 SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34 SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25 SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16 SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07 SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3 SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2 psra%1 m%10, m%2, 1 padd%1 m%10, m%2 padd%1 m%10, m%3 padd%1 m%10, m%4 ; %10=a4 psra%1 m%11, m%5, 1 padd%1 m%11, m%5 padd%1 m%11, m%3 psub%1 m%11, m%4 ; %11=a7 SUMSUB_BA %1, %5, %2 psub%1 m%2, m%4 psub%1 m%5, m%3 psra%1 m%4, 1 psra%1 m%3, 1 psub%1 m%2, m%4 ; %2=a5 psub%1 m%5, m%3 ; %5=a6 psra%1 m%3, m%11, 2 padd%1 m%3, m%10 ; %3=b1 psra%1 m%10, 2 psub%1 m%10, m%11 ; %10=b7 SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4 psra%1 m%4, m%8, 1 padd%1 m%4, m%9 ; %4=b2 psra%1 m%9, 1 psub%1 m%9, m%8 ; %9=b6 psra%1 m%8, m%5, 2 padd%1 m%8, m%2 ; %8=b3 psra%1 m%2, 2 psub%1 m%5, m%2 ; %5=b5 SWAP %2, %7, %5, %8, %9, %10 %endmacro %macro IDCT8_1D 11 SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2 psra%1 m%10, m%3, 1 padd%1 m%10, m%3 padd%1 m%10, m%5 padd%1 m%10, m%7 ; %9=a7 psra%1 m%11, m%4, 1 psub%1 m%11, m%8 ; %10=a4 psra%1 m%8, 1 padd%1 m%8, m%4 ; %7=a6 psra%1 m%4, m%7, 1 padd%1 m%4, m%7 padd%1 m%4, m%9 psub%1 m%4, m%3 ; %3=a5 psub%1 m%3, m%5 psub%1 m%7, m%5 padd%1 m%3, m%9 psub%1 m%7, m%9 psra%1 m%5, 1 psra%1 m%9, 1 psub%1 m%3, m%5 ; %2=a3 psub%1 m%7, m%9 ; %6=a1 psra%1 m%5, m%10, 2 padd%1 m%5, m%7 ; %4=b1 psra%1 m%7, 2 psub%1 m%10, m%7 ; %9=b7 SUMSUB_BA %1, %8, %6, %7 ; %7=b0, %5=b6 SUMSUB_BA %1, %11, %2, %7 ; %10=b2, %1=b4 psra%1 m%9, m%4, 2 padd%1 m%9, m%3 ; %8=b3 psra%1 m%3, 2 psub%1 m%3, m%4 ; %2=b5 SUMSUB_BA %1, %10, %8, %7 ; %9=c0, %7=c7 SUMSUB_BA %1, %3, %11, %7 ; %2=c1, %10=c6 SUMSUB_BA %1, %9, %2, %7 ; %8=c2, %1=c5 SUMSUB_BA %1, %5, %6, %7 ; %4=c3, %5=c4 SWAP %11, %4 SWAP %2, %10, %7 SWAP %4, %9, %8 %endmacro %if HIGH_BIT_DEPTH %macro SUB8x8_DCT8 0 cglobal sub8x8_dct8, 3,3,14 TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2 LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2 DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9 TRANSPOSE4x4W 0,1,2,3,8 WIDEN_SXWD 0,8 WIDEN_SXWD 1,9 WIDEN_SXWD 2,10 WIDEN_SXWD 3,11 DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13 mova [r0+0x00], m0 mova [r0+0x20], m8 mova [r0+0x40], m1 mova [r0+0x60], m9 mova [r0+0x80], m2 mova [r0+0xA0], m10 mova [r0+0xC0], m3 mova [r0+0xE0], m11 TRANSPOSE4x4W 4,5,6,7,0 WIDEN_SXWD 4,0 WIDEN_SXWD 5,1 WIDEN_SXWD 6,2 WIDEN_SXWD 7,3 DCT8_1D d,4,0,5,1,6,2,7,3, 8,9 mova [r0+0x10], m4 mova [r0+0x30], m0 mova [r0+0x50], m5 mova [r0+0x70], m1 mova [r0+0x90], m6 mova [r0+0xB0], m2 mova [r0+0xD0], m7 mova [r0+0xF0], m3 ret %endmacro ; SUB8x8_DCT8 INIT_XMM sse2 SUB8x8_DCT8 INIT_XMM sse4 SUB8x8_DCT8 INIT_XMM avx SUB8x8_DCT8 %macro ADD8x8_IDCT8 0 cglobal add8x8_idct8, 2,2,16 add r1, 128 TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue mova m0, [r1-128] mova m1, [r1-96] mova m2, [r1-64] mova m3, [r1-32] mova m4, [r1+ 0] mova m5, [r1+32] mova m6, [r1+64] mova m7, 
[r1+96] IDCT8_1D d,0,1,2,3,4,5,6,7,8,9 TRANSPOSE4x4D 0,1,2,3,8 TRANSPOSE4x4D 4,5,6,7,8 paddd m0, [pd_32] paddd m4, [pd_32] mova [r1+64], m6 mova [r1+96], m7 mova m8, [r1-112] mova m9, [r1-80] mova m10, [r1-48] mova m11, [r1-16] mova m12, [r1+16] mova m13, [r1+48] mova m14, [r1+80] mova m15, [r1+112] IDCT8_1D d,8,9,10,11,12,13,14,15,6,7 TRANSPOSE4x4D 8,9,10,11,6 TRANSPOSE4x4D 12,13,14,15,6 IDCT8_1D d,0,1,2,3,8,9,10,11,6,7 mova [r1-112], m8 mova [r1-80], m9 mova m6, [r1+64] mova m7, [r1+96] IDCT8_1D d,4,5,6,7,12,13,14,15,8,9 pxor m8, m8 mova m9, [pw_pixel_max] STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB] STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB] STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB] STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB] mova m0, [r1-112] mova m1, [r1-80] STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB] STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB] STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB] STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB] ret %endmacro ; ADD8x8_IDCT8 INIT_XMM sse2 ADD8x8_IDCT8 INIT_XMM avx ADD8x8_IDCT8 %else ; !HIGH_BIT_DEPTH %macro DCT_SUB8 0 cglobal sub8x8_dct, 3,3,10 add r2, 4*FDEC_STRIDE %if cpuflag(ssse3) mova m7, [hsub_mul] %endif TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue SWAP 7, 9 LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE DCT4_1D 0, 1, 2, 3, 8 TRANSPOSE2x4x4W 0, 1, 2, 3, 8 DCT4_1D 4, 5, 6, 7, 8 TRANSPOSE2x4x4W 4, 5, 6, 7, 8 DCT4_1D 0, 1, 2, 3, 8 STORE_DCT 0, 1, 2, 3, r0, 0 DCT4_1D 4, 5, 6, 7, 8 STORE_DCT 4, 5, 6, 7, r0, 64 ret ;----------------------------------------------------------------------------- ; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- cglobal sub8x8_dct8, 3,3,11 add r2, 4*FDEC_STRIDE %if cpuflag(ssse3) mova m7, [hsub_mul] %endif TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue SWAP 7, 10 LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 movdqa [r0+0x00], m0 movdqa [r0+0x10], m1 movdqa [r0+0x20], m2 movdqa [r0+0x30], m3 movdqa [r0+0x40], m4 movdqa [r0+0x50], m5 movdqa [r0+0x60], m6 movdqa [r0+0x70], m7 ret %endmacro INIT_XMM sse2 %define movdqa movaps %define punpcklqdq movlhps DCT_SUB8 %undef movdqa %undef punpcklqdq INIT_XMM ssse3 DCT_SUB8 INIT_XMM avx DCT_SUB8 INIT_XMM xop DCT_SUB8 INIT_YMM avx2 cglobal sub16x16_dct8, 3,3,10 add r0, 128 add r2, 4*FDEC_STRIDE call .sub16x8_dct8 add r0, 256 add r1, FENC_STRIDE*8 add r2, FDEC_STRIDE*8 call .sub16x8_dct8 RET .sub16x8_dct8: LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1 LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5 LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 DCT8_1D w, 0,1,2,3,4,5,6,7,8,9 mova [r0-0x80+0x00], xm0 vextracti128 [r0+0x00], m0, 1 mova [r0-0x80+0x10], xm1 vextracti128 [r0+0x10], m1, 1 mova [r0-0x80+0x20], xm2 vextracti128 [r0+0x20], m2, 1 mova [r0-0x80+0x30], xm3 vextracti128 [r0+0x30], m3, 1 mova [r0-0x80+0x40], xm4 vextracti128 [r0+0x40], m4, 1 mova [r0-0x80+0x50], xm5 vextracti128 [r0+0x50], m5, 1 mova [r0-0x80+0x60], xm6 vextracti128 [r0+0x60], m6, 1 mova [r0-0x80+0x70], xm7 vextracti128 [r0+0x70], m7, 1 ret ;----------------------------------------------------------------------------- ; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] ) 
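; Performs the two IDCT8_1D passes with a transpose in between; the +32
; rounding term for the final >>6 is folded in before the second pass, and
; DIFFx2/STORE_IDCT add the scaled result to the predicted pixels with
; saturation, i.e. roughly dst[i] = clip( dst[i] + ((r[i] + 32) >> 6) ).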
;----------------------------------------------------------------------------- %macro ADD8x8_IDCT8 0 cglobal add8x8_idct8, 2,2,11 add r0, 4*FDEC_STRIDE pxor m7, m7 TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue SWAP 7, 9 movdqa m0, [r1+0x00] movdqa m1, [r1+0x10] movdqa m2, [r1+0x20] movdqa m3, [r1+0x30] movdqa m4, [r1+0x40] movdqa m5, [r1+0x50] movdqa m6, [r1+0x60] movdqa m7, [r1+0x70] IDCT8_1D w,0,1,2,3,4,5,6,7,8,10 TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8 paddw m0, [pw_32] ; rounding for the >>6 at the end IDCT8_1D w,0,1,2,3,4,5,6,7,8,10 DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] STORE_IDCT m1, m3, m5, m7 ret %endmacro ; ADD8x8_IDCT8 INIT_XMM sse2 ADD8x8_IDCT8 INIT_XMM avx ADD8x8_IDCT8 ;----------------------------------------------------------------------------- ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) ;----------------------------------------------------------------------------- %macro ADD8x8 0 cglobal add8x8_idct, 2,2,11 add r0, 4*FDEC_STRIDE pxor m7, m7 TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue SWAP 7, 9 mova m0, [r1+ 0] mova m2, [r1+16] mova m1, [r1+32] mova m3, [r1+48] SBUTTERFLY qdq, 0, 1, 4 SBUTTERFLY qdq, 2, 3, 4 mova m4, [r1+64] mova m6, [r1+80] mova m5, [r1+96] mova m7, [r1+112] SBUTTERFLY qdq, 4, 5, 8 SBUTTERFLY qdq, 6, 7, 8 IDCT4_1D w,0,1,2,3,8,10 TRANSPOSE2x4x4W 0,1,2,3,8 IDCT4_1D w,4,5,6,7,8,10 TRANSPOSE2x4x4W 4,5,6,7,8 paddw m0, [pw_32] IDCT4_1D w,0,1,2,3,8,10 paddw m4, [pw_32] IDCT4_1D w,4,5,6,7,8,10 DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE] DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE] DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE] DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE] STORE_IDCT m1, m3, m5, m7 ret %endmacro ; ADD8x8 INIT_XMM sse2 ADD8x8 INIT_XMM avx ADD8x8 %endif ; !HIGH_BIT_DEPTH x264-master/common/x86/dct-a.asm000066400000000000000000002064071502133446700164770ustar00rootroot00000000000000;***************************************************************************** ;* dct-a.asm: x86 transform and zigzag ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Holger Lubitz ;* Loren Merritt ;* Laurent Aimar ;* Min Chen ;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 64 ; AVX-512 permutation indices are bit-packed to save cache %if HIGH_BIT_DEPTH scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1 dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2 dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3 ; bits 19-23: 8x8_frame4 scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1 dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2 dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3 dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4 cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1 dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2 dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3 dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4 %else dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2 dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1 dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2 dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30 scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1 dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2 dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2 cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1 dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2 dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd %endif pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1 pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5 pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7 pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9 pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15 pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14 pb_scan8framet2: SHUFFLE_MASK_W 2 , 3, 4, 7, 9, 15, 10, 14 pb_scan8framet3: SHUFFLE_MASK_W 0, 1, 5, 6, 8, 11, 12, 13 pb_scan8framet4: SHUFFLE_MASK_W 0, 3, 4, 5, 8, 11, 12, 15 pb_scan8framet5: SHUFFLE_MASK_W 1, 2, 6, 7, 9, 10, 13, 14 pb_scan8framet6: SHUFFLE_MASK_W 0, 3, 4, 5, 10, 11, 12, 15 pb_scan8framet7: SHUFFLE_MASK_W 1, 2, 6, 7, 8, 9, 14, 15 pb_scan8framet8: SHUFFLE_MASK_W 0, 1, 2, 7, 8, 10, 11, 14 pb_scan8framet9: SHUFFLE_MASK_W 1, 4, 5, 7, 8, 13, 14, 15 pb_scan8frame1: 
SHUFFLE_MASK_W 0, 8, 1, 2, 9, 12, 4, 13 pb_scan8frame2: SHUFFLE_MASK_W 4, 0, 1, 5, 8, 10, 12, 14 pb_scan8frame3: SHUFFLE_MASK_W 12, 10, 8, 6, 2, 3, 7, 9 pb_scan8frame4: SHUFFLE_MASK_W 0, 1, 8, 12, 4, 13, 9, 2 pb_scan8frame5: SHUFFLE_MASK_W 5, 14, 10, 3, 11, 15, 6, 7 pb_scan8frame6: SHUFFLE_MASK_W 6, 8, 12, 13, 9, 7, 5, 3 pb_scan8frame7: SHUFFLE_MASK_W 1, 3, 5, 7, 10, 14, 15, 11 pb_scan8frame8: SHUFFLE_MASK_W 10, 3, 11, 14, 5, 6, 15, 7 pb_scan8field1 : SHUFFLE_MASK_W 0, 1, 2, 8, 9, 3, 4, 10 pb_scan8field2a: SHUFFLE_MASK_W 0x80, 11, 5, 6, 7, 12,0x80,0x80 pb_scan8field2b: SHUFFLE_MASK_W 0,0x80,0x80,0x80,0x80,0x80, 1, 8 pb_scan8field3a: SHUFFLE_MASK_W 10, 5, 6, 7, 11,0x80,0x80,0x80 pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80, 1, 8, 2 pb_scan8field4a: SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80,0x80 pb_scan8field6 : SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80, 12 pb_scan8field7 : SHUFFLE_MASK_W 5, 6, 7, 11,0x80,0x80, 12, 13 SECTION .text cextern pw_32_0 cextern pw_32 cextern pw_512 cextern pw_8000 cextern pw_pixel_max cextern hsub_mul cextern pb_1 cextern pw_1 cextern pd_1 cextern pd_32 cextern pw_ppppmmmm cextern pw_pmpmpmpm cextern deinterleave_shufd cextern pb_unpackbd1 cextern pb_unpackbd2 %macro WALSH4_1D 6 SUMSUB_BADC %1, %5, %4, %3, %2, %6 SUMSUB_BADC %1, %5, %3, %4, %2, %6 SWAP %2, %5, %4 %endmacro %macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000 movq m%3, m%4 pxor m%1, m%4 psubw m%3, m%2 pxor m%2, m%4 pavgw m%3, m%1 pavgw m%2, m%1 pxor m%3, m%4 pxor m%2, m%4 SWAP %1, %2, %3 %endmacro %macro DCT_UNPACK 3 punpcklwd %3, %1 punpckhwd %2, %1 psrad %3, 16 psrad %2, 16 SWAP %1, %3 %endmacro %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void dct4x4dc( dctcoef d[4][4] ) ;----------------------------------------------------------------------------- %macro DCT4x4_DC 0 cglobal dct4x4dc, 1,1,5 mova m0, [r0+ 0] mova m1, [r0+16] mova m2, [r0+32] mova m3, [r0+48] WALSH4_1D d, 0,1,2,3,4 TRANSPOSE4x4D 0,1,2,3,4 paddd m0, [pd_1] WALSH4_1D d, 0,1,2,3,4 psrad m0, 1 psrad m1, 1 psrad m2, 1 psrad m3, 1 mova [r0+ 0], m0 mova [r0+16], m1 mova [r0+32], m2 mova [r0+48], m3 RET %endmacro ; DCT4x4_DC INIT_XMM sse2 DCT4x4_DC INIT_XMM avx DCT4x4_DC %else INIT_MMX mmx2 cglobal dct4x4dc, 1,1 movq m3, [r0+24] movq m2, [r0+16] movq m1, [r0+ 8] movq m0, [r0+ 0] movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works WALSH4_1D w, 0,1,2,3,4 TRANSPOSE4x4W 0,1,2,3,4 SUMSUB_BADC w, 1, 0, 3, 2, 4 SWAP 0, 1 SWAP 2, 3 SUMSUB_17BIT 0,2,4,7 SUMSUB_17BIT 1,3,5,7 movq [r0+0], m0 movq [r0+8], m2 movq [r0+16], m3 movq [r0+24], m1 RET %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void idct4x4dc( int32_t d[4][4] ) ;----------------------------------------------------------------------------- %macro IDCT4x4DC 0 cglobal idct4x4dc, 1,1 mova m3, [r0+48] mova m2, [r0+32] mova m1, [r0+16] mova m0, [r0+ 0] WALSH4_1D d,0,1,2,3,4 TRANSPOSE4x4D 0,1,2,3,4 WALSH4_1D d,0,1,2,3,4 mova [r0+ 0], m0 mova [r0+16], m1 mova [r0+32], m2 mova [r0+48], m3 RET %endmacro ; IDCT4x4DC INIT_XMM sse2 IDCT4x4DC INIT_XMM avx IDCT4x4DC %else ;----------------------------------------------------------------------------- ; void idct4x4dc( int16_t d[4][4] ) ;----------------------------------------------------------------------------- INIT_MMX mmx cglobal idct4x4dc, 1,1 movq m3, [r0+24] movq m2, [r0+16] movq m1, [r0+ 8] movq m0, [r0+ 0] WALSH4_1D w,0,1,2,3,4 TRANSPOSE4x4W 0,1,2,3,4 WALSH4_1D w,0,1,2,3,4 movq [r0+ 0], m0 movq 
[r0+ 8], m1 movq [r0+16], m2 movq [r0+24], m3 RET %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) ;----------------------------------------------------------------------------- %if WIN64 DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size %else DECLARE_REG_TMP 2 %endif %macro INSERT_COEFF 3 ; dst, src, imm %if %3 %if HIGH_BIT_DEPTH %if cpuflag(sse4) pinsrd %1, %2, %3 %elif %3 == 2 movd m2, %2 %elif %3 == 1 punpckldq %1, %2 %else punpckldq m2, %2 punpcklqdq %1, m2 %endif %else %if %3 == 2 punpckldq %1, %2 %else pinsrw %1, %2, %3 %endif %endif %else movd %1, %2 %endif %if HIGH_BIT_DEPTH mov %2, t0d %else mov %2, t0w %endif %endmacro %macro DCT2x4DC 2 cglobal dct2x4dc, 2,3 xor t0d, t0d INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0 INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2 add r1, 4*16*SIZEOF_DCTCOEF INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1 INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3 INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0 INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2 INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1 INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3 SUMSUB_BA %1, 1, 0, 2 SBUTTERFLY %2, 1, 0, 2 SUMSUB_BA %1, 0, 1, 2 SBUTTERFLY %2, 0, 1, 2 SUMSUB_BA %1, 1, 0, 2 pshuf%1 m0, m0, q1032 mova [r0], m1 mova [r0+mmsize], m0 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 DCT2x4DC d, dq INIT_XMM avx DCT2x4DC d, dq %else INIT_MMX mmx2 DCT2x4DC w, wd %endif %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 ) ;----------------------------------------------------------------------------- INIT_MMX mmx cglobal sub4x4_dct, 3,3 .skip_prologue: LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE] LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE] LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE] LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE] DCT4_1D 0,1,2,3,4 TRANSPOSE4x4W 0,1,2,3,4 SUMSUB_BADC w, 3, 0, 2, 1 SUMSUB_BA w, 2, 3, 4 DCT_UNPACK m2, m4, m5 DCT_UNPACK m3, m6, m7 mova [r0+ 0], m2 ; s03 + s12 mova [r0+ 8], m4 mova [r0+32], m3 ; s03 - s12 mova [r0+40], m6 DCT_UNPACK m0, m2, m4 DCT_UNPACK m1, m3, m5 SUMSUB2_AB d, 0, 1, 4 SUMSUB2_AB d, 2, 3, 5 mova [r0+16], m0 ; d03*2 + d12 mova [r0+24], m2 mova [r0+48], m4 ; d03 - 2*d12 mova [r0+56], m5 RET %else %macro SUB_DCT4 0 cglobal sub4x4_dct, 3,3 .skip_prologue: %if cpuflag(ssse3) mova m5, [hsub_mul] %endif LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2 DCT4_1D 0,1,2,3,4 TRANSPOSE4x4W 0,1,2,3,4 DCT4_1D 0,1,2,3,4 movq [r0+ 0], m0 movq [r0+ 8], m1 movq [r0+16], m2 movq [r0+24], m3 RET %endmacro INIT_MMX mmx SUB_DCT4 INIT_MMX ssse3 SUB_DCT4 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] ) ;----------------------------------------------------------------------------- %macro STORE_DIFFx2 6 psrad %1, 6 psrad %2, 6 packssdw %1, %2 movq %3, %5 movhps %3, %6 paddsw %1, %3 CLIPW %1, %4, [pw_pixel_max] movq %5, %1 movhps %6, %1 %endmacro %macro ADD4x4_IDCT 0 cglobal add4x4_idct, 2,2,6 add r0, 2*FDEC_STRIDEB .skip_prologue: mova m1, [r1+16] mova m3, [r1+48] mova m2, [r1+32] mova m0, [r1+ 0] IDCT4_1D d,0,1,2,3,4,5 TRANSPOSE4x4D 0,1,2,3,4 paddd m0, [pd_32] IDCT4_1D d,0,1,2,3,4,5 pxor m5, m5 STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB] 
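; The STORE_DIFFx2 invocations around this point finish the high-bit-depth
; add4x4_idct: each pair of 32-bit coefficients is shifted right by 6 (the
; pd_32 bias was added to row 0 before the column pass), packed to 16 bits,
; added to two rows of predicted pixels and clipped to [0, pw_pixel_max].
; A rough scalar model in C -- the helper name and PIXEL_MAX value are
; assumptions for illustration:
;
;   #include <stdint.h>
;   #define PIXEL_MAX 1023   /* e.g. 10-bit depth */
;   static void store_diff_sketch( uint16_t *dst, const int32_t *coef, int n )
;   {
;       for( int i = 0; i < n; i++ )
;       {
;           int v = dst[i] + (coef[i] >> 6);                       /* psrad + paddsw */
;           dst[i] = v < 0 ? 0 : v > PIXEL_MAX ? PIXEL_MAX : v;    /* CLIPW          */
;       }
;   }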
STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB] RET %endmacro INIT_XMM sse2 ADD4x4_IDCT INIT_XMM avx ADD4x4_IDCT %else ; !HIGH_BIT_DEPTH INIT_MMX mmx cglobal add4x4_idct, 2,2 pxor m7, m7 .skip_prologue: movq m1, [r1+ 8] movq m3, [r1+24] movq m2, [r1+16] movq m0, [r1+ 0] IDCT4_1D w,0,1,2,3,4,5 TRANSPOSE4x4W 0,1,2,3,4 paddw m0, [pw_32] IDCT4_1D w,0,1,2,3,4,5 STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE] STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE] STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE] STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE] RET %macro ADD4x4 0 cglobal add4x4_idct, 2,2,6 mova m1, [r1+0x00] ; row1/row0 mova m3, [r1+0x10] ; row3/row2 psraw m0, m1, 1 ; row1>>1/... psraw m2, m3, 1 ; row3>>1/... movsd m0, m1 ; row1>>1/row0 movsd m2, m3 ; row3>>1/row2 psubw m0, m3 ; row1>>1-row3/row0-2 paddw m2, m1 ; row3>>1+row1/row0+2 SBUTTERFLY2 wd, 0, 2, 1 SUMSUB_BA w, 2, 0, 1 pshuflw m1, m2, q2301 pshufhw m2, m2, q2301 punpckldq m1, m0 punpckhdq m2, m0 SWAP 0, 1 mova m1, [pw_32_0] paddw m1, m0 ; row1/row0 corrected psraw m0, 1 ; row1>>1/... psraw m3, m2, 1 ; row3>>1/... movsd m0, m1 ; row1>>1/row0 movsd m3, m2 ; row3>>1/row2 psubw m0, m2 ; row1>>1-row3/row0-2 paddw m3, m1 ; row3>>1+row1/row0+2 SBUTTERFLY2 qdq, 0, 3, 1 SUMSUB_BA w, 3, 0, 1 movd m4, [r0+FDEC_STRIDE*0] movd m1, [r0+FDEC_STRIDE*1] movd m2, [r0+FDEC_STRIDE*2] movd m5, [r0+FDEC_STRIDE*3] punpckldq m1, m4 ; row0/row1 pxor m4, m4 punpckldq m2, m5 ; row3/row2 punpcklbw m1, m4 psraw m3, 6 punpcklbw m2, m4 psraw m0, 6 paddsw m3, m1 paddsw m0, m2 packuswb m0, m3 ; row0/row1/row3/row2 pextrd [r0+FDEC_STRIDE*0], m0, 3 pextrd [r0+FDEC_STRIDE*1], m0, 2 movd [r0+FDEC_STRIDE*2], m0 pextrd [r0+FDEC_STRIDE*3], m0, 1 RET %endmacro ; ADD4x4 INIT_XMM sse4 ADD4x4 INIT_XMM avx ADD4x4 %macro STOREx2_AVX2 9 movq xm%3, [r0+%5*FDEC_STRIDE] vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1 movq xm%4, [r0+%7*FDEC_STRIDE] vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1 punpcklbw m%3, m%9 punpcklbw m%4, m%9 psraw m%1, 6 psraw m%2, 6 paddsw m%1, m%3 paddsw m%2, m%4 packuswb m%1, m%2 vextracti128 xm%2, m%1, 1 movq [r0+%5*FDEC_STRIDE], xm%1 movq [r0+%6*FDEC_STRIDE], xm%2 movhps [r0+%7*FDEC_STRIDE], xm%1 movhps [r0+%8*FDEC_STRIDE], xm%2 %endmacro INIT_YMM avx2 cglobal add8x8_idct, 2,3,8 add r0, 4*FDEC_STRIDE pxor m7, m7 TAIL_CALL .skip_prologue, 0 cglobal_label .skip_prologue ; TRANSPOSE4x4Q mova xm0, [r1+ 0] mova xm1, [r1+32] mova xm2, [r1+16] mova xm3, [r1+48] vinserti128 m0, m0, [r1+ 64], 1 vinserti128 m1, m1, [r1+ 96], 1 vinserti128 m2, m2, [r1+ 80], 1 vinserti128 m3, m3, [r1+112], 1 SBUTTERFLY qdq, 0, 1, 4 SBUTTERFLY qdq, 2, 3, 4 IDCT4_1D w,0,1,2,3,4,5 TRANSPOSE2x4x4W 0,1,2,3,4 paddw m0, [pw_32] IDCT4_1D w,0,1,2,3,4,5 STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7 STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7 ret ; 2xdst, 2xtmp, 4xsrcrow, 1xzero %macro LOAD_DIFF8x2_AVX2 9 movq xm%1, [r1+%5*FENC_STRIDE] movq xm%2, [r1+%6*FENC_STRIDE] vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1 vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1 punpcklbw m%1, m%9 punpcklbw m%2, m%9 movq xm%3, [r2+(%5-4)*FDEC_STRIDE] movq xm%4, [r2+(%6-4)*FDEC_STRIDE] vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1 vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1 punpcklbw m%3, m%9 punpcklbw m%4, m%9 psubw m%1, m%3 psubw m%2, m%4 %endmacro ; 4x src, 1x tmp %macro STORE8_DCT_AVX2 5 SBUTTERFLY qdq, %1, %2, %5 SBUTTERFLY qdq, %3, %4, %5 mova [r0+ 0], xm%1 mova [r0+ 16], xm%3 mova [r0+ 32], xm%2 mova [r0+ 48], xm%4 vextracti128 [r0+ 64], m%1, 1 vextracti128 [r0+ 80], m%3, 1 vextracti128 [r0+ 96], m%2, 1 vextracti128 
[r0+112], m%4, 1 %endmacro %macro STORE16_DCT_AVX2 5 SBUTTERFLY qdq, %1, %2, %5 SBUTTERFLY qdq, %3, %4, %5 mova [r0+ 0-128], xm%1 mova [r0+16-128], xm%3 mova [r0+32-128], xm%2 mova [r0+48-128], xm%4 vextracti128 [r0+ 0], m%1, 1 vextracti128 [r0+16], m%3, 1 vextracti128 [r0+32], m%2, 1 vextracti128 [r0+48], m%4, 1 %endmacro INIT_YMM avx2 cglobal sub8x8_dct, 3,3,7 pxor m6, m6 add r2, 4*FDEC_STRIDE LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6 LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6 DCT4_1D 0, 1, 2, 3, 4 TRANSPOSE2x4x4W 0, 1, 2, 3, 4 DCT4_1D 0, 1, 2, 3, 4 STORE8_DCT_AVX2 0, 1, 2, 3, 4 RET INIT_YMM avx2 cglobal sub16x16_dct, 3,3,6 add r0, 128 add r2, 4*FDEC_STRIDE call .sub16x4_dct add r0, 64 add r1, 4*FENC_STRIDE add r2, 4*FDEC_STRIDE call .sub16x4_dct add r0, 256-64 add r1, 4*FENC_STRIDE add r2, 4*FDEC_STRIDE call .sub16x4_dct add r0, 64 add r1, 4*FENC_STRIDE add r2, 4*FDEC_STRIDE call .sub16x4_dct RET .sub16x4_dct: LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1 LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3 DCT4_1D 0, 1, 2, 3, 4 TRANSPOSE2x4x4W 0, 1, 2, 3, 4 DCT4_1D 0, 1, 2, 3, 4 STORE16_DCT_AVX2 0, 1, 2, 3, 4 ret %macro DCT4x4_AVX512 0 psubw m0, m2 ; 0 1 psubw m1, m3 ; 3 2 SUMSUB_BA w, 1, 0, 2 SBUTTERFLY wd, 1, 0, 2 paddw m2, m1, m0 psubw m3, m1, m0 vpaddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1 vpsubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3 shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2 punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1 SUMSUB_BA w, 1, 2, 3 shufps m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 b1-b2 d1-d2 shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3 paddw m2, m1, m3 psubw m0, m1, m3 vpaddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1 vpsubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3' %endmacro INIT_XMM avx512 cglobal sub4x4_dct mov eax, 0xf0aa kmovw k1, eax PROLOGUE 3,3 movd m0, [r1+0*FENC_STRIDE] movd m2, [r2+0*FDEC_STRIDE] vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE] vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE] movd m1, [r1+3*FENC_STRIDE] movd m3, [r2+3*FDEC_STRIDE] vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE] vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE] kshiftrw k2, k1, 8 pxor m4, m4 punpcklbw m0, m4 punpcklbw m2, m4 punpcklbw m1, m4 punpcklbw m3, m4 DCT4x4_AVX512 mova [r0], m2 mova [r0+16], m0 RET INIT_ZMM avx512 cglobal dct4x4x4_internal punpcklbw m0, m1, m4 punpcklbw m2, m3, m4 punpckhbw m1, m4 punpckhbw m3, m4 DCT4x4_AVX512 mova m1, m2 vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0 vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1 ret %macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2 movu %1, [r1+%3*FENC_STRIDE] vpermt2d %1, %2, [r1+%4*FENC_STRIDE] %endmacro %macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2 movu %1, [r2+(%4 )*FDEC_STRIDE] vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE] movu %3, [r2+(%5 )*FDEC_STRIDE] vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE] vpermt2d %1, %2, %3 %endmacro cglobal sub8x8_dct, 3,3 mova m0, [dct_avx512] DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3 mov r1d, 0xaaaaaaaa kmovd k1, r1d psrld m0, 5 DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4 mov r1d, 0xf0f0f0f0 kmovd k2, r1d pxor xm4, xm4 knotw k3, k2 call dct4x4x4_internal_avx512 mova [r0], m0 mova [r0+64], m1 RET %macro SUB4x16_DCT_AVX512 2 ; dst, src vpermd m1, m5, [r1+1*%2*64] mova m3, [r2+2*%2*64] vpermt2d m3, m6, [r2+2*%2*64+64] call dct4x4x4_internal_avx512 mova [r0+%1*64 ], m0 mova [r0+%1*64+128], m1 %endmacro cglobal sub16x16_dct psrld m5, [dct_avx512], 10 mov eax, 0xaaaaaaaa kmovd k1, eax mov eax, 0xf0f0f0f0 kmovd k2, eax PROLOGUE 3,3 pxor xm4, xm4 knotw k3, k2 psrld m6, m5, 4 SUB4x16_DCT_AVX512 0, 0 
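; Each SUB4x16_DCT_AVX512 strip runs dct4x4x4_internal, i.e. DCT4x4_AVX512,
; on four 4x4 blocks packed into one zmm register; the k1/k2 write masks add
; the extra term only in the lanes that need the x2 weighting, so both
; butterfly outputs are produced in place without additional shuffles. The
; scalar butterfly being vectorized is the usual 4x4 forward transform (cf.
; the s03/s12/d03/d12 comments in sub4x4_dct above); a rough C sketch of one
; row -- the function name is an assumption, not the exact reference code:
;
;   static void dct4_1d_row_sketch( int d[4] )
;   {
;       int s03 = d[0] + d[3], d03 = d[0] - d[3];
;       int s12 = d[1] + d[2], d12 = d[1] - d[2];
;       d[0] =   s03 + s12;       /* DC */
;       d[1] = 2*d03 + d12;
;       d[2] =   s03 - s12;
;       d[3] =   d03 - 2*d12;
;   }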
SUB4x16_DCT_AVX512 1, 1 SUB4x16_DCT_AVX512 4, 2 SUB4x16_DCT_AVX512 5, 3 RET cglobal sub8x8_dct_dc, 3,3 mova m3, [dct_avx512] DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3 mov r1d, 0xaa kmovb k1, r1d psrld m3, 5 DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4 pxor xm3, xm3 psadbw m0, m3 psadbw m1, m3 psubw m0, m1 vpmovqw xmm0, m0 vprold xmm1, xmm0, 16 paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3 punpckhqdq xmm2, xmm0, xmm0 psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3 paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3 punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3 punpcklqdq xmm1, xmm0, xmm0 vpsubw xmm0 {k1}, xm3, xmm0 paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3 movhps [r0], xmm0 RET cglobal sub8x16_dct_dc, 3,3 mova m5, [dct_avx512] DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5 DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7 mov r1d, 0xaa kmovb k1, r1d psrld m5, 5 DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8 DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12 pxor xm4, xm4 psadbw m0, m4 psadbw m1, m4 psadbw m2, m4 psadbw m3, m4 psubw m0, m2 psubw m1, m3 SBUTTERFLY qdq, 0, 1, 2 paddw m0, m1 vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7 psrlq xmm2, xmm0, 32 psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7 paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7 punpckhdq xmm2, xmm0, xmm1 punpckldq xmm0, xmm1 psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7 paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7 punpcklwd xmm0, xmm1 psrlq xmm2, xmm0, 32 psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7 paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7 shufps xmm0, xmm1, q0220 mova [r0], xmm0 RET %macro SARSUMSUB 3 ; a, b, tmp mova m%3, m%1 vpsraw m%1 {k1}, 1 psubw m%1, m%2 ; 0-2 1>>1-3 vpsraw m%2 {k1}, 1 paddw m%2, m%3 ; 0+2 1+3>>1 %endmacro cglobal add8x8_idct, 2,2 mova m1, [r1] mova m2, [r1+64] mova m3, [dct_avx512] vbroadcasti32x4 m4, [pw_32] mov r1d, 0xf0f0f0f0 kxnorb k2, k2, k2 kmovd k1, r1d kmovb k3, k2 vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE vpgatherqq m6 {k2}, [r0+m5] SARSUMSUB 0, 1, 2 SBUTTERFLY wd, 1, 0, 2 psrlq m7, m3, 28 SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3 vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1 SBUTTERFLY dq, 0, 1, 2 psrlq m3, 24 SARSUMSUB 0, 1, 2 vpermi2q m3, m1, m0 vpermt2q m1, m7, m0 paddw m3, m4 ; += 32 SUMSUB_BA w, 1, 3, 0 psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3' psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3' pxor xm0, xm0 SBUTTERFLY bw, 6, 0, 2 paddsw m1, m6 paddsw m3, m0 packuswb m1, m3 vpscatterqq [r0+m5] {k3}, m1 RET %endif ; HIGH_BIT_DEPTH INIT_MMX ;----------------------------------------------------------------------------- ; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- %macro SUB_NxN_DCT 7 cglobal %1, 3,3,%7 %if HIGH_BIT_DEPTH == 0 %if mmsize == 8 pxor m7, m7 %else add r2, 4*FDEC_STRIDE mova m7, [hsub_mul] %endif %endif ; !HIGH_BIT_DEPTH .skip_prologue: call %2.skip_prologue add r0, %3 add r1, %4-%5-%6*FENC_STRIDE add r2, %4-%5-%6*FDEC_STRIDE call %2.skip_prologue add r0, %3 add r1, (%4-%6)*FENC_STRIDE-%5-%4 add r2, (%4-%6)*FDEC_STRIDE-%5-%4 call %2.skip_prologue add r0, %3 add r1, %4-%5-%6*FENC_STRIDE add r2, %4-%5-%6*FDEC_STRIDE TAIL_CALL %2.skip_prologue, 1 %endmacro ;----------------------------------------------------------------------------- ; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] ) 
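; SUB_NxN_DCT above and the ADD_NxN_IDCT macro below compose the 16x16 (and
; MMX 8x8) entry points out of the quarter-size kernels: they jump into the
; smaller function's .skip_prologue label four times, stepping the coefficient
; and pixel pointers between calls (parameters %3-%6 encode the increments),
; and finish with a tail call. Roughly what the plain-C equivalent would do,
; sketched with hypothetical _c names and x264's fixed FDEC_STRIDE
; reconstruction stride:
;
;   #include <stdint.h>
;   void add8x8_idct_c( uint8_t *dst, int16_t dct[4][16] );     /* assumed helper */
;   static void add16x16_idct_sketch( uint8_t *dst, int16_t dct[16][16] )
;   {
;       add8x8_idct_c( dst,                   &dct[ 0] );   /* top-left     */
;       add8x8_idct_c( dst + 8,               &dct[ 4] );   /* top-right    */
;       add8x8_idct_c( dst + 8*FDEC_STRIDE,   &dct[ 8] );   /* bottom-left  */
;       add8x8_idct_c( dst + 8*FDEC_STRIDE+8, &dct[12] );   /* bottom-right */
;   }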
;----------------------------------------------------------------------------- %macro ADD_NxN_IDCT 6-7 %if HIGH_BIT_DEPTH cglobal %1, 2,2,%7 %if %3==256 add r1, 128 %endif %else cglobal %1, 2,2,11 pxor m7, m7 %endif %if mmsize>=16 && %3!=256 add r0, 4*FDEC_STRIDE %endif .skip_prologue: call %2.skip_prologue add r0, %4-%5-%6*FDEC_STRIDE add r1, %3 call %2.skip_prologue add r0, (%4-%6)*FDEC_STRIDE-%5-%4 add r1, %3 call %2.skip_prologue add r0, %4-%5-%6*FDEC_STRIDE add r1, %3 TAIL_CALL %2.skip_prologue, 1 %endmacro %if HIGH_BIT_DEPTH INIT_MMX SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0 INIT_XMM ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6 ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6 ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6 cextern add8x8_idct8_sse2.skip_prologue cextern add8x8_idct8_avx.skip_prologue ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16 cextern sub8x8_dct8_sse2.skip_prologue cextern sub8x8_dct8_sse4.skip_prologue cextern sub8x8_dct8_avx.skip_prologue SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14 SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14 %else ; !HIGH_BIT_DEPTH %if ARCH_X86_64 == 0 INIT_MMX SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0 ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0 SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0 ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4 cextern sub8x8_dct8_mmx.skip_prologue cextern add8x8_idct8_mmx.skip_prologue SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0 ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0 %endif INIT_XMM cextern sub8x8_dct_sse2.skip_prologue cextern sub8x8_dct_ssse3.skip_prologue cextern sub8x8_dct_avx.skip_prologue cextern sub8x8_dct_xop.skip_prologue SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10 SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10 SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10 SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10 cextern add8x8_idct_sse2.skip_prologue cextern add8x8_idct_avx.skip_prologue ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0 ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0 cextern add8x8_idct8_sse2.skip_prologue cextern add8x8_idct8_avx.skip_prologue ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0 ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0 cextern sub8x8_dct8_sse2.skip_prologue cextern sub8x8_dct8_ssse3.skip_prologue cextern sub8x8_dct8_avx.skip_prologue SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11 SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11 INIT_YMM ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 ) ;----------------------------------------------------------------------------- %macro ADD_DC 2 mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels mova m1, [%1+FDEC_STRIDEB*1] mova m2, 
[%1+FDEC_STRIDEB*2] paddsw m0, %2 paddsw m1, %2 paddsw m2, %2 paddsw %2, [%1+FDEC_STRIDEB*3] CLIPW m0, m5, m6 CLIPW m1, m5, m6 CLIPW m2, m5, m6 CLIPW %2, m5, m6 mova [%1+FDEC_STRIDEB*0], m0 mova [%1+FDEC_STRIDEB*1], m1 mova [%1+FDEC_STRIDEB*2], m2 mova [%1+FDEC_STRIDEB*3], %2 %endmacro %macro ADD_IDCT_DC 0 cglobal add8x8_idct_dc, 2,2,7 mova m6, [pw_pixel_max] pxor m5, m5 mova m3, [r1] paddd m3, [pd_32] psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _ pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3 ADD_DC r0+FDEC_STRIDEB*0, m4 ADD_DC r0+FDEC_STRIDEB*4, m3 RET cglobal add16x16_idct_dc, 2,3,8 mov r2, 4 mova m6, [pw_pixel_max] mova m7, [pd_32] pxor m5, m5 .loop: mova m3, [r1] paddd m3, m7 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0 pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _ pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3 pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1 pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3 ADD_DC r0+FDEC_STRIDEB*0, m4 ADD_DC r0+SIZEOF_PIXEL*8, m3 add r1, 16 add r0, 4*FDEC_STRIDEB dec r2 jg .loop RET %endmacro ; ADD_IDCT_DC INIT_XMM sse2 ADD_IDCT_DC INIT_XMM avx ADD_IDCT_DC %else ;!HIGH_BIT_DEPTH %macro ADD_DC 3 mova m4, [%3+FDEC_STRIDE*0] mova m5, [%3+FDEC_STRIDE*1] mova m6, [%3+FDEC_STRIDE*2] paddusb m4, %1 paddusb m5, %1 paddusb m6, %1 paddusb %1, [%3+FDEC_STRIDE*3] psubusb m4, %2 psubusb m5, %2 psubusb m6, %2 psubusb %1, %2 mova [%3+FDEC_STRIDE*0], m4 mova [%3+FDEC_STRIDE*1], m5 mova [%3+FDEC_STRIDE*2], m6 mova [%3+FDEC_STRIDE*3], %1 %endmacro INIT_MMX mmx2 cglobal add8x8_idct_dc, 2,2 mova m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 paddw m0, [pw_32] psraw m0, 6 psubw m1, m0 packuswb m0, m0 packuswb m1, m1 punpcklbw m0, m0 punpcklbw m1, m1 pshufw m2, m0, q3322 pshufw m3, m1, q3322 punpcklbw m0, m0 punpcklbw m1, m1 ADD_DC m0, m1, r0-FDEC_STRIDE*4 ADD_DC m2, m3, r0 RET INIT_XMM ssse3 cglobal add8x8_idct_dc, 2,2 movh m0, [r1] pxor m1, m1 add r0, FDEC_STRIDE*4 pmulhrsw m0, [pw_512] psubw m1, m0 mova m5, [pb_unpackbd1] packuswb m0, m0 packuswb m1, m1 pshufb m0, m5 pshufb m1, m5 movh m2, [r0+FDEC_STRIDE*-4] movh m3, [r0+FDEC_STRIDE*-3] movh m4, [r0+FDEC_STRIDE*-2] movh m5, [r0+FDEC_STRIDE*-1] movhps m2, [r0+FDEC_STRIDE* 0] movhps m3, [r0+FDEC_STRIDE* 1] movhps m4, [r0+FDEC_STRIDE* 2] movhps m5, [r0+FDEC_STRIDE* 3] paddusb m2, m0 paddusb m3, m0 paddusb m4, m0 paddusb m5, m0 psubusb m2, m1 psubusb m3, m1 psubusb m4, m1 psubusb m5, m1 movh [r0+FDEC_STRIDE*-4], m2 movh [r0+FDEC_STRIDE*-3], m3 movh [r0+FDEC_STRIDE*-2], m4 movh [r0+FDEC_STRIDE*-1], m5 movhps [r0+FDEC_STRIDE* 0], m2 movhps [r0+FDEC_STRIDE* 1], m3 movhps [r0+FDEC_STRIDE* 2], m4 movhps [r0+FDEC_STRIDE* 3], m5 RET INIT_MMX mmx2 cglobal add16x16_idct_dc, 2,3 mov r2, 4 .loop: mova m0, [r1] pxor m1, m1 paddw m0, [pw_32] psraw m0, 6 psubw m1, m0 packuswb m0, m0 packuswb m1, m1 punpcklbw m0, m0 punpcklbw m1, m1 pshufw m2, m0, q3322 pshufw m3, m1, q3322 punpcklbw m0, m0 punpcklbw m1, m1 ADD_DC m0, m1, r0 ADD_DC m2, m3, r0+8 add r1, 8 add r0, FDEC_STRIDE*4 dec r2 jg .loop RET INIT_XMM sse2 cglobal add16x16_idct_dc, 2,2,8 call .loop add r0, FDEC_STRIDE*4 TAIL_CALL .loop, 0 .loop: add r0, FDEC_STRIDE*4 movq m0, [r1+0] movq m2, [r1+8] add r1, 16 punpcklwd m0, m0 punpcklwd m2, m2 pxor m3, m3 paddw m0, [pw_32] paddw m2, [pw_32] psraw m0, 6 psraw m2, 6 psubw m1, m3, m0 packuswb m0, m1 psubw m3, m2 punpckhbw m1, m0, m0 packuswb m2, m3 punpckhbw m3, m2, m2 punpcklbw m0, m0 
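; At this point the registers hold the DC offsets split into a clamped
; positive part (+dc) and a clamped negated part (-dc), both produced by
; packuswb; ADD_DC then applies them with paddusb followed by psubusb, which
; together emulate a signed add with unsigned saturation on 8-bit pixels.
; A scalar model of that trick in C -- the helper name is an assumption:
;
;   #include <stdint.h>
;   static inline uint8_t add_dc_sat_sketch( uint8_t pix, int dc )
;   {
;       int pos = dc > 0 ?  dc : 0;                 /* packuswb(+dc) */
;       int neg = dc < 0 ? -dc : 0;                 /* packuswb(-dc) */
;       int v = pix + pos; if( v > 255 ) v = 255;   /* paddusb       */
;       v -= neg;          if( v < 0 )   v = 0;     /* psubusb       */
;       return (uint8_t)v;
;   }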
punpcklbw m2, m2 ADD_DC m0, m1, r0+FDEC_STRIDE*-4 ADD_DC m2, m3, r0 ret %macro ADD16x16 0 cglobal add16x16_idct_dc, 2,2,8 call .loop add r0, FDEC_STRIDE*4 TAIL_CALL .loop, 0 .loop: add r0, FDEC_STRIDE*4 mova m0, [r1] add r1, 16 pxor m1, m1 pmulhrsw m0, [pw_512] psubw m1, m0 mova m5, [pb_unpackbd1] mova m6, [pb_unpackbd2] packuswb m0, m0 packuswb m1, m1 pshufb m2, m0, m6 pshufb m0, m5 pshufb m3, m1, m6 pshufb m1, m5 ADD_DC m0, m1, r0+FDEC_STRIDE*-4 ADD_DC m2, m3, r0 ret %endmacro ; ADD16x16 INIT_XMM ssse3 ADD16x16 INIT_XMM avx ADD16x16 %macro ADD_DC_AVX2 3 mova xm4, [r0+FDEC_STRIDE*0+%3] mova xm5, [r0+FDEC_STRIDE*1+%3] vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1 vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1 paddusb m4, %1 paddusb m5, %1 psubusb m4, %2 psubusb m5, %2 mova [r0+FDEC_STRIDE*0+%3], xm4 mova [r0+FDEC_STRIDE*1+%3], xm5 vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1 vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1 %endmacro INIT_YMM avx2 cglobal add16x16_idct_dc, 2,3,6 add r0, FDEC_STRIDE*4 mova m0, [r1] pxor m1, m1 pmulhrsw m0, [pw_512] psubw m1, m0 mova m4, [pb_unpackbd1] mova m5, [pb_unpackbd2] packuswb m0, m0 packuswb m1, m1 pshufb m2, m0, m4 ; row0, row2 pshufb m3, m1, m4 ; row0, row2 pshufb m0, m5 ; row1, row3 pshufb m1, m5 ; row1, row3 lea r2, [r0+FDEC_STRIDE*8] ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4 ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2 ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0 ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2 RET %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 ) ;----------------------------------------------------------------------------- %macro DCTDC_2ROW_MMX 4 mova %1, [r1+FENC_STRIDE*(0+%3)] mova m1, [r1+FENC_STRIDE*(1+%3)] mova m2, [r2+FDEC_STRIDE*(0+%4)] mova m3, [r2+FDEC_STRIDE*(1+%4)] mova %2, %1 punpckldq %1, m1 punpckhdq %2, m1 mova m1, m2 punpckldq m2, m3 punpckhdq m1, m3 pxor m3, m3 psadbw %1, m3 psadbw %2, m3 psadbw m2, m3 psadbw m1, m3 psubw %1, m2 psubw %2, m1 %endmacro %macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1) PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0 PSHUFLW m0, %2, q2301 ; s3 __ s2 __ paddw m1, %2 ; s1 s13 s0 s02 psubw m1, m0 ; d13 s13 d02 s02 PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02 psrlq m1, 32 ; __ __ d13 s13 paddw m0, m1 ; d02 s02 d02+d13 s02+s13 psllq m1, 32 ; d13 s13 psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13 %endmacro %if HIGH_BIT_DEPTH == 0 INIT_MMX mmx2 cglobal sub8x8_dct_dc, 3,3 DCTDC_2ROW_MMX m0, m4, 0, 0 DCTDC_2ROW_MMX m5, m6, 2, 2 paddw m0, m5 paddw m4, m6 punpckldq m0, m4 add r2, FDEC_STRIDE*4 DCTDC_2ROW_MMX m7, m4, 4, 0 DCTDC_2ROW_MMX m5, m6, 6, 2 paddw m7, m5 paddw m4, m6 punpckldq m7, m4 DCT2x2 m0, m7 mova [r0], m0 ret %macro DCTDC_2ROW_SSE2 4 movh m1, [r1+FENC_STRIDE*(0+%1)] movh m2, [r1+FENC_STRIDE*(1+%1)] punpckldq m1, m2 movh m2, [r2+FDEC_STRIDE*(0+%2)] punpckldq m2, [r2+FDEC_STRIDE*(1+%2)] psadbw m1, m0 psadbw m2, m0 ACCUM paddd, %4, 1, %3 psubd m%4, m2 %endmacro INIT_XMM sse2 cglobal sub8x8_dct_dc, 3,3 pxor m0, m0 DCTDC_2ROW_SSE2 0, 0, 0, 3 DCTDC_2ROW_SSE2 2, 2, 1, 3 add r2, FDEC_STRIDE*4 DCTDC_2ROW_SSE2 4, 0, 0, 4 DCTDC_2ROW_SSE2 6, 2, 1, 4 packssdw m3, m3 packssdw m4, m4 DCT2x2 m3, m4 movq [r0], m0 RET %macro SUB8x16_DCT_DC 0 cglobal sub8x16_dct_dc, 3,3 pxor m0, m0 DCTDC_2ROW_SSE2 0, 0, 0, 3 DCTDC_2ROW_SSE2 2, 2, 1, 3 add r1, FENC_STRIDE*8 add r2, FDEC_STRIDE*8 DCTDC_2ROW_SSE2 -4, -4, 0, 4 DCTDC_2ROW_SSE2 -2, -2, 1, 4 shufps m3, m4, q2020 DCTDC_2ROW_SSE2 0, 0, 0, 5 DCTDC_2ROW_SSE2 2, 2, 1, 5 add r2, FDEC_STRIDE*4 
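; sub8x8_dct_dc / sub8x16_dct_dc only need the DC of each 4x4 sub-block:
; psadbw sums the fenc and fdec rows separately, the two sums are subtracted,
; and a small 2x2 (or 2x4) Hadamard combines the per-block sums (DCT2x2 above,
; or the %%sign/shuffle sequence that follows). A rough scalar sketch of the
; 8x8 case in C -- the helper name and the output ordering of the 2x2 Hadamard
; are illustrative only; the real ordering is fixed by the DCT2x2 macro:
;
;   #include <stdint.h>
;   static void sub8x8_dct_dc_sketch( int16_t dct[4],
;                                     const uint8_t *pix1, int stride1,
;                                     const uint8_t *pix2, int stride2 )
;   {
;       int s[4] = {0};
;       for( int i = 0; i < 4; i++ )             /* sum each 4x4 sub-block   */
;           for( int y = 0; y < 4; y++ )
;               for( int x = 0; x < 4; x++ )
;                   s[i] += pix1[(y+(i>>1)*4)*stride1 + x + (i&1)*4]
;                         - pix2[(y+(i>>1)*4)*stride2 + x + (i&1)*4];
;       int a0 = s[0] + s[1], a1 = s[0] - s[1];  /* 2x2 Hadamard of the sums */
;       int a2 = s[2] + s[3], a3 = s[2] - s[3];
;       dct[0] = a0 + a2; dct[1] = a1 + a3;
;       dct[2] = a0 - a2; dct[3] = a1 - a3;
;   }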
DCTDC_2ROW_SSE2 4, 0, 0, 4 DCTDC_2ROW_SSE2 6, 2, 1, 4 shufps m5, m4, q2020 %if cpuflag(ssse3) %define %%sign psignw %else %define %%sign pmullw %endif SUMSUB_BA d, 5, 3, 0 packssdw m5, m3 pshuflw m0, m5, q2301 pshufhw m0, m0, q2301 %%sign m5, [pw_pmpmpmpm] paddw m0, m5 pshufd m1, m0, q1320 pshufd m0, m0, q0231 %%sign m1, [pw_ppppmmmm] paddw m0, m1 mova [r0], m0 RET %endmacro ; SUB8x16_DCT_DC INIT_XMM sse2 SUB8x16_DCT_DC INIT_XMM ssse3 SUB8x16_DCT_DC %endif ; !HIGH_BIT_DEPTH %macro DCTDC_4ROW_SSE2 2 mova %1, [r1+FENC_STRIDEB*%2] mova m0, [r2+FDEC_STRIDEB*%2] %assign Y (%2+1) %rep 3 paddw %1, [r1+FENC_STRIDEB*Y] paddw m0, [r2+FDEC_STRIDEB*Y] %assign Y (Y+1) %endrep psubw %1, m0 pshufd m0, %1, q2301 paddw %1, m0 %endmacro %if HIGH_BIT_DEPTH %macro SUB8x8_DCT_DC_10 0 cglobal sub8x8_dct_dc, 3,3,3 DCTDC_4ROW_SSE2 m1, 0 DCTDC_4ROW_SSE2 m2, 4 mova m0, [pw_ppmmmmpp] pmaddwd m1, m0 pmaddwd m2, m0 pshufd m0, m1, q2200 ; -1 -1 +0 +0 pshufd m1, m1, q0033 ; +0 +0 +1 +1 paddd m1, m0 pshufd m0, m2, q1023 ; -2 +2 -3 +3 paddd m1, m2 paddd m1, m0 mova [r0], m1 RET %endmacro INIT_XMM sse2 SUB8x8_DCT_DC_10 %macro SUB8x16_DCT_DC_10 0 cglobal sub8x16_dct_dc, 3,3,6 DCTDC_4ROW_SSE2 m1, 0 DCTDC_4ROW_SSE2 m2, 4 DCTDC_4ROW_SSE2 m3, 8 DCTDC_4ROW_SSE2 m4, 12 mova m0, [pw_ppmmmmpp] pmaddwd m1, m0 pmaddwd m2, m0 pshufd m5, m1, q2200 ; -1 -1 +0 +0 pshufd m1, m1, q0033 ; +0 +0 +1 +1 paddd m1, m5 pshufd m5, m2, q1023 ; -2 +2 -3 +3 paddd m1, m2 paddd m1, m5 ; a6 a2 a4 a0 pmaddwd m3, m0 pmaddwd m4, m0 pshufd m5, m3, q2200 pshufd m3, m3, q0033 paddd m3, m5 pshufd m5, m4, q1023 paddd m3, m4 paddd m3, m5 ; a7 a3 a5 a1 paddd m0, m1, m3 psubd m1, m3 pshufd m0, m0, q3120 pshufd m1, m1, q3120 punpcklqdq m2, m0, m1 punpckhqdq m1, m0 mova [r0+ 0], m2 mova [r0+16], m1 RET %endmacro INIT_XMM sse2 SUB8x16_DCT_DC_10 INIT_XMM avx SUB8x16_DCT_DC_10 %endif ;----------------------------------------------------------------------------- ; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- %macro SCAN_8x8 0 cglobal zigzag_scan_8x8_frame, 2,2,8 movdqa xmm0, [r1] movdqa xmm1, [r1+16] movdq2q mm0, xmm0 PALIGNR xmm1, xmm1, 14, xmm2 movdq2q mm1, xmm1 movdqa xmm2, [r1+32] movdqa xmm3, [r1+48] PALIGNR xmm2, xmm2, 12, xmm4 movdq2q mm2, xmm2 PALIGNR xmm3, xmm3, 10, xmm4 movdq2q mm3, xmm3 punpckhwd xmm0, xmm1 punpckhwd xmm2, xmm3 movq mm4, mm1 movq mm5, mm1 movq mm6, mm2 movq mm7, mm3 punpckhwd mm1, mm0 psllq mm0, 16 psrlq mm3, 16 punpckhdq mm1, mm1 punpckhdq mm2, mm0 punpcklwd mm0, mm4 punpckhwd mm4, mm3 punpcklwd mm4, mm2 punpckhdq mm0, mm2 punpcklwd mm6, mm3 punpcklwd mm5, mm7 punpcklwd mm5, mm6 movdqa xmm4, [r1+64] movdqa xmm5, [r1+80] movdqa xmm6, [r1+96] movdqa xmm7, [r1+112] movq [r0+2*00], mm0 movq [r0+2*04], mm4 movd [r0+2*08], mm1 movq [r0+2*36], mm5 movq [r0+2*46], mm6 PALIGNR xmm4, xmm4, 14, xmm3 movdq2q mm4, xmm4 PALIGNR xmm5, xmm5, 12, xmm3 movdq2q mm5, xmm5 PALIGNR xmm6, xmm6, 10, xmm3 movdq2q mm6, xmm6 %if cpuflag(ssse3) PALIGNR xmm7, xmm7, 8, xmm3 movdq2q mm7, xmm7 %else movhlps xmm3, xmm7 punpcklqdq xmm7, xmm7 movdq2q mm7, xmm3 %endif punpckhwd xmm4, xmm5 punpckhwd xmm6, xmm7 movq mm0, mm4 movq mm1, mm5 movq mm3, mm7 punpcklwd mm7, mm6 psrlq mm6, 16 punpcklwd mm4, mm6 punpcklwd mm5, mm4 punpckhdq mm4, mm3 punpcklwd mm3, mm6 punpckhwd mm3, mm4 punpckhwd mm0, mm1 punpckldq mm4, mm0 punpckhdq mm0, mm6 pshufw mm4, mm4, q1230 movq [r0+2*14], mm4 movq [r0+2*25], mm0 movd [r0+2*54], mm7 movq [r0+2*56], mm5 movq [r0+2*60], mm3 punpckhdq xmm3, xmm0, 
xmm2 punpckldq xmm0, xmm2 punpckhdq xmm7, xmm4, xmm6 punpckldq xmm4, xmm6 pshufhw xmm0, xmm0, q0123 pshuflw xmm4, xmm4, q0123 pshufhw xmm3, xmm3, q0123 pshuflw xmm7, xmm7, q0123 movlps [r0+2*10], xmm0 movhps [r0+2*17], xmm0 movlps [r0+2*21], xmm3 movlps [r0+2*28], xmm4 movhps [r0+2*32], xmm3 movhps [r0+2*39], xmm4 movlps [r0+2*43], xmm7 movhps [r0+2*50], xmm7 RET %endmacro %if HIGH_BIT_DEPTH == 0 INIT_XMM sse2 SCAN_8x8 INIT_XMM ssse3 SCAN_8x8 %endif ;----------------------------------------------------------------------------- ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] ) ;----------------------------------------------------------------------------- ; Output order: ; 0 8 1 2 9 16 24 17 ; 10 3 4 11 18 25 32 40 ; 33 26 19 12 5 6 13 20 ; 27 34 41 48 56 49 42 35 ; 28 21 14 7 15 22 29 36 ; 43 50 57 58 51 44 37 30 ; 23 31 38 45 52 59 60 53 ; 46 39 47 54 61 62 55 63 %macro SCAN_8x8_FRAME 5 cglobal zigzag_scan_8x8_frame, 2,2,8 mova m0, [r1] mova m1, [r1+ 8*SIZEOF_DCTCOEF] movu m2, [r1+14*SIZEOF_DCTCOEF] movu m3, [r1+21*SIZEOF_DCTCOEF] mova m4, [r1+28*SIZEOF_DCTCOEF] punpckl%4 m5, m0, m1 psrl%2 m0, %1 punpckh%4 m6, m1, m0 punpckl%3 m5, m0 punpckl%3 m1, m1 punpckh%4 m1, m3 mova m7, [r1+52*SIZEOF_DCTCOEF] mova m0, [r1+60*SIZEOF_DCTCOEF] punpckh%4 m1, m2 punpckl%4 m2, m4 punpckh%4 m4, m3 punpckl%3 m3, m3 punpckh%4 m3, m2 mova [r0], m5 mova [r0+ 4*SIZEOF_DCTCOEF], m1 mova [r0+ 8*SIZEOF_DCTCOEF], m6 punpckl%4 m6, m0 punpckl%4 m6, m7 mova m1, [r1+32*SIZEOF_DCTCOEF] movu m5, [r1+39*SIZEOF_DCTCOEF] movu m2, [r1+46*SIZEOF_DCTCOEF] movu [r0+35*SIZEOF_DCTCOEF], m3 movu [r0+47*SIZEOF_DCTCOEF], m4 punpckh%4 m7, m0 psll%2 m0, %1 punpckh%3 m3, m5, m5 punpckl%4 m5, m1 punpckh%4 m1, m2 mova [r0+52*SIZEOF_DCTCOEF], m6 movu [r0+13*SIZEOF_DCTCOEF], m5 movu m4, [r1+11*SIZEOF_DCTCOEF] movu m6, [r1+25*SIZEOF_DCTCOEF] punpckl%4 m5, m7 punpckl%4 m1, m3 punpckh%3 m0, m7 mova m3, [r1+ 4*SIZEOF_DCTCOEF] movu m7, [r1+18*SIZEOF_DCTCOEF] punpckl%4 m2, m5 movu [r0+25*SIZEOF_DCTCOEF], m1 mova m1, m4 mova m5, m6 punpckl%4 m4, m3 punpckl%4 m6, m7 punpckh%4 m1, m3 punpckh%4 m5, m7 punpckh%3 m3, m6, m4 punpckh%3 m7, m5, m1 punpckl%3 m6, m4 punpckl%3 m5, m1 movu m4, [r1+35*SIZEOF_DCTCOEF] movu m1, [r1+49*SIZEOF_DCTCOEF] pshuf%5 m6, m6, q0123 pshuf%5 m5, m5, q0123 mova [r0+60*SIZEOF_DCTCOEF], m0 mova [r0+56*SIZEOF_DCTCOEF], m2 movu m0, [r1+42*SIZEOF_DCTCOEF] mova m2, [r1+56*SIZEOF_DCTCOEF] movu [r0+17*SIZEOF_DCTCOEF], m3 mova [r0+32*SIZEOF_DCTCOEF], m7 movu [r0+10*SIZEOF_DCTCOEF], m6 movu [r0+21*SIZEOF_DCTCOEF], m5 punpckh%4 m3, m0, m4 punpckh%4 m7, m2, m1 punpckl%4 m0, m4 punpckl%4 m2, m1 punpckl%3 m4, m2, m0 punpckl%3 m1, m7, m3 punpckh%3 m2, m0 punpckh%3 m7, m3 pshuf%5 m2, m2, q0123 pshuf%5 m7, m7, q0123 mova [r0+28*SIZEOF_DCTCOEF], m4 movu [r0+43*SIZEOF_DCTCOEF], m1 movu [r0+39*SIZEOF_DCTCOEF], m2 movu [r0+50*SIZEOF_DCTCOEF], m7 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 SCAN_8x8_FRAME 4 , dq, qdq, dq, d INIT_XMM avx SCAN_8x8_FRAME 4 , dq, qdq, dq, d %else INIT_MMX mmx2 SCAN_8x8_FRAME 16, q , dq , wd, w %endif ;----------------------------------------------------------------------------- ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] ) ;----------------------------------------------------------------------------- %macro SCAN_4x4 4 cglobal zigzag_scan_4x4_frame, 2,2,6 mova m0, [r1+ 0*SIZEOF_DCTCOEF] mova m1, [r1+ 4*SIZEOF_DCTCOEF] mova m2, [r1+ 8*SIZEOF_DCTCOEF] mova m3, [r1+12*SIZEOF_DCTCOEF] punpckl%4 m4, m0, m1 psrl%2 m0, %1 punpckl%3 m4, m0 mova [r0+ 0*SIZEOF_DCTCOEF], m4 punpckh%4 
m0, m2 punpckh%4 m4, m2, m3 psll%2 m3, %1 punpckl%3 m2, m2 punpckl%4 m5, m1, m3 punpckh%3 m1, m1 punpckh%4 m5, m2 punpckl%4 m1, m0 punpckh%3 m3, m4 mova [r0+ 4*SIZEOF_DCTCOEF], m5 mova [r0+ 8*SIZEOF_DCTCOEF], m1 mova [r0+12*SIZEOF_DCTCOEF], m3 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 SCAN_4x4 4, dq, qdq, dq INIT_XMM avx SCAN_4x4 4, dq, qdq, dq %else INIT_MMX mmx SCAN_4x4 16, q , dq , wd ;----------------------------------------------------------------------------- ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] ) ;----------------------------------------------------------------------------- %macro SCAN_4x4_FRAME 0 cglobal zigzag_scan_4x4_frame, 2,2 mova m1, [r1+16] mova m0, [r1+ 0] pshufb m1, [pb_scan4frameb] pshufb m0, [pb_scan4framea] psrldq m2, m1, 6 palignr m1, m0, 6 pslldq m0, 10 palignr m2, m0, 10 mova [r0+ 0], m1 mova [r0+16], m2 RET %endmacro INIT_XMM ssse3 SCAN_4x4_FRAME INIT_XMM avx SCAN_4x4_FRAME INIT_XMM xop cglobal zigzag_scan_4x4_frame, 2,2 mova m0, [r1+ 0] mova m1, [r1+16] vpperm m2, m0, m1, [pb_scan4frame2a] vpperm m1, m0, m1, [pb_scan4frame2b] mova [r0+ 0], m2 mova [r0+16], m1 RET %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] ) ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal zigzag_scan_4x4_field, 2,2 movu m0, [r1+ 8] pshufd m0, m0, q3102 mova m1, [r1+32] mova m2, [r1+48] movu [r0+ 8], m0 mova [r0+32], m1 mova [r0+48], m2 movq mm0, [r1] movq [r0], mm0 movq mm0, [r1+24] movq [r0+24], mm0 RET %else ;----------------------------------------------------------------------------- ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] ) ;----------------------------------------------------------------------------- INIT_XMM sse cglobal zigzag_scan_4x4_field, 2,2 mova m0, [r1] mova m1, [r1+16] pshufw mm0, [r1+4], q3102 mova [r0], m0 mova [r0+16], m1 movq [r0+4], mm0 RET %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- ; Output order: ; 0 1 2 8 9 3 4 10 ; 16 11 5 6 7 12 17 24 ; 18 13 14 15 19 25 32 26 ; 20 21 22 23 27 33 40 34 ; 28 29 30 31 35 41 48 42 ; 36 37 38 39 43 49 50 44 ; 45 46 47 51 56 57 52 53 ; 54 55 58 59 60 61 62 63 %undef SCAN_8x8 %macro SCAN_8x8 5 cglobal zigzag_scan_8x8_field, 2,3,8 mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00 mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04 mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08 pshuf%1 m3, m0, q3333 ; 03 03 03 03 movd r2d, m2 ; 09 08 pshuf%1 m2, m2, q0321 ; 08 11 10 09 punpckl%2 m3, m1 ; 05 03 04 03 pinsr%1 m0, r2d, 3 ; 08 02 01 00 punpckl%2 m4, m2, m3 ; 04 10 03 09 pshuf%1 m4, m4, q2310 ; 10 04 03 09 mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00 mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09 mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12 mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16 punpckl%3 m6, m5 ; 17 16 XX XX psrl%4 m1, %5 ; XX 07 06 05 punpckh%2 m6, m2 ; 08 17 11 16 punpckl%3 m6, m1 ; 06 05 11 16 mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16 psrl%4 m1, %5 ; XX XX 07 06 punpckl%2 m1, m5 ; 17 07 16 06 mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20 mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24 punpckh%3 m1, m1 ; 17 07 17 07 punpckl%2 m6, m3, m2 ; 25 13 24 12 pextr%1 r2d, m5, 2 mova 
[r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20 punpckl%2 m1, m6 ; 24 17 12 07 mova [r0+12*SIZEOF_DCTCOEF], m1 pinsr%1 m3, r2d, 0 ; 15 14 13 18 mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18 mova m7, [r1+28*SIZEOF_DCTCOEF] mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32 psrl%4 m5, %5*3 ; XX XX XX 19 pshuf%1 m1, m2, q3321 ; 27 27 26 25 punpckl%2 m5, m0 ; 33 XX 32 19 psrl%4 m2, %5*3 ; XX XX XX 27 punpckl%2 m5, m1 ; 26 32 25 19 mova [r0+32*SIZEOF_DCTCOEF], m7 mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19 mova m7, [r1+36*SIZEOF_DCTCOEF] mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40 pshuf%1 m3, m0, q3321 ; 35 35 34 33 punpckl%2 m2, m1 ; 41 XX 40 27 mova [r0+40*SIZEOF_DCTCOEF], m7 punpckl%2 m2, m3 ; 34 40 33 27 mova [r0+28*SIZEOF_DCTCOEF], m2 mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44 mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48 psrl%4 m0, %5*3 ; XX XX XX 35 punpckl%2 m0, m2 ; 49 XX 48 35 pshuf%1 m3, m1, q3321 ; 43 43 42 41 punpckl%2 m0, m3 ; 42 48 41 35 mova [r0+36*SIZEOF_DCTCOEF], m0 pextr%1 r2d, m2, 3 ; 51 psrl%4 m1, %5*3 ; XX XX XX 43 punpckl%2 m1, m7 ; 45 XX 44 43 psrl%4 m2, %5 ; XX 51 50 49 punpckl%2 m1, m2 ; 50 44 49 43 pshuf%1 m1, m1, q2310 ; 44 50 49 43 mova [r0+44*SIZEOF_DCTCOEF], m1 psrl%4 m7, %5 ; XX 47 46 45 pinsr%1 m7, r2d, 3 ; 51 47 46 45 mova [r0+48*SIZEOF_DCTCOEF], m7 mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56 mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52 mova m7, [r1+60*SIZEOF_DCTCOEF] punpckl%3 m2, m0, m1 ; 53 52 57 56 punpckh%3 m1, m0 ; 59 58 55 54 mova [r0+52*SIZEOF_DCTCOEF], m2 mova [r0+56*SIZEOF_DCTCOEF], m1 mova [r0+60*SIZEOF_DCTCOEF], m7 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse4 SCAN_8x8 d, dq, qdq, dq, 4 INIT_XMM avx SCAN_8x8 d, dq, qdq, dq, 4 %else INIT_MMX mmx2 SCAN_8x8 w, wd, dq , q , 16 %endif ;----------------------------------------------------------------------------- ; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst ) ;----------------------------------------------------------------------------- %macro ZIGZAG_SUB_4x4 2 %ifidn %1, ac cglobal zigzag_sub_4x4%1_%2, 4,4,8 %else cglobal zigzag_sub_4x4%1_%2, 3,3,8 %endif movd m0, [r1+0*FENC_STRIDE] movd m1, [r1+1*FENC_STRIDE] movd m2, [r1+2*FENC_STRIDE] movd m3, [r1+3*FENC_STRIDE] movd m4, [r2+0*FDEC_STRIDE] movd m5, [r2+1*FDEC_STRIDE] movd m6, [r2+2*FDEC_STRIDE] movd m7, [r2+3*FDEC_STRIDE] movd [r2+0*FDEC_STRIDE], m0 movd [r2+1*FDEC_STRIDE], m1 movd [r2+2*FDEC_STRIDE], m2 movd [r2+3*FDEC_STRIDE], m3 punpckldq m0, m1 punpckldq m2, m3 punpckldq m4, m5 punpckldq m6, m7 punpcklqdq m0, m2 punpcklqdq m4, m6 mova m7, [pb_sub4%2] pshufb m0, m7 pshufb m4, m7 mova m7, [hsub_mul] punpckhbw m1, m0, m4 punpcklbw m0, m4 pmaddubsw m1, m7 pmaddubsw m0, m7 %ifidn %1, ac movd r2d, m0 pand m0, [pb_subacmask] %endif mova [r0+ 0], m0 por m0, m1 pxor m2, m2 mova [r0+16], m1 pcmpeqb m0, m2 pmovmskb eax, m0 %ifidn %1, ac mov [r3], r2w %endif sub eax, 0xffff shr eax, 31 RET %endmacro %if HIGH_BIT_DEPTH == 0 INIT_XMM ssse3 ZIGZAG_SUB_4x4 , frame ZIGZAG_SUB_4x4 ac, frame ZIGZAG_SUB_4x4 , field ZIGZAG_SUB_4x4 ac, field INIT_XMM avx ZIGZAG_SUB_4x4 , frame ZIGZAG_SUB_4x4 ac, frame ZIGZAG_SUB_4x4 , field ZIGZAG_SUB_4x4 ac, field %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 INIT_XMM xop cglobal zigzag_scan_8x8_field, 2,3,7 lea r2, [pb_scan8field1] %define off(m) (r2+m-pb_scan8field1) mova m0, [r1+ 0] mova m1, [r1+ 16] vpperm m5, m0, m1, [off(pb_scan8field1)] mova [r0+ 0], m5 vpperm m0, m0, m1, [off(pb_scan8field2a)] mova m2, [r1+ 32] mova m3, [r1+ 48] vpperm m5, m2, m3, [off(pb_scan8field2b)] por m5, m0 mova [r0+ 
16], m5 mova m4, [off(pb_scan8field3b)] vpperm m1, m1, m2, [off(pb_scan8field3a)] mova m0, [r1+ 64] vpperm m5, m3, m0, m4 por m5, m1 mova [r0+ 32], m5 ; 4b, 5b are the same as pb_scan8field3b. ; 5a is the same as pb_scan8field4a. mova m5, [off(pb_scan8field4a)] vpperm m2, m2, m3, m5 mova m1, [r1+ 80] vpperm m6, m0, m1, m4 por m6, m2 mova [r0+ 48], m6 vpperm m3, m3, m0, m5 mova m2, [r1+ 96] vpperm m5, m1, m2, m4 por m5, m3 mova [r0+ 64], m5 vpperm m5, m0, m1, [off(pb_scan8field6)] mova [r0+ 80], m5 vpperm m5, m1, m2, [off(pb_scan8field7)] mov r2d, [r1+ 98] mov [r0+ 90], r2d mova [r0+ 96], m5 mova m3, [r1+112] movd [r0+104], m3 mov r2d, [r1+108] mova [r0+112], m3 mov [r0+112], r2d %undef off RET cglobal zigzag_scan_8x8_frame, 2,3,8 lea r2, [pb_scan8frame1] %define off(m) (r2+m-pb_scan8frame1) mova m7, [r1+ 16] mova m3, [r1+ 32] vpperm m7, m7, m3, [off(pb_scan8framet1)] ; 8 9 14 15 16 17 21 22 mova m2, [r1+ 48] vpperm m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30 mova m1, [r1+ 80] mova m4, [r1+ 64] vpperm m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45 vpperm m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45 vpperm m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44 vpperm m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39 mova m4, [r1+ 96] vpperm m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55 mova m1, [r1+ 0] vpperm m2, m1, m3, [off(pb_scan8framet8)] ; 0 1 2 7 24 28 29 36 vpperm m1, m2, m7, [off(pb_scan8frame1)] ; 0 8 1 2 9 16 24 17 mova [r0+ 0], m1 movh m0, [r1+ 6] movhps m0, [r1+ 20] ; 3 4 5 6 10 11 12 13 vpperm m1, m0, m6, [off(pb_scan8frame2)] ; 10 3 4 11 18 25 32 40 mova [r0+ 16], m1 vpperm m1, m0, m5, [off(pb_scan8frame3)] ; 33 26 19 12 5 6 13 20 mova [r0+ 32], m1 vpperm m1, m2, m7, [off(pb_scan8frame5)] ; 28 21 14 7 15 22 29 36 mova [r0+ 64], m1 movh m0, [r1+100] movhps m0, [r1+114] ; 50 51 52 53 57 58 59 60 vpperm m1, m5, m0, [off(pb_scan8frame6)] ; 43 50 57 58 51 44 37 30 mova [r0+ 80], m1 vpperm m1, m6, m0, [off(pb_scan8frame7)] ; 23 31 38 45 52 59 60 53 mova [r0+ 96], m1 mova m1, [r1+112] vpperm m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63 vpperm m1, m0, m4, [off(pb_scan8frame4)] ; 27 34 41 48 56 49 42 35 mova [r0+ 48], m1 vpperm m1, m0, m4, [off(pb_scan8frame8)] ; 46 39 47 54 61 62 55 63 mova [r0+112], m1 %undef off RET %endif ;----------------------------------------------------------------------------- ; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz ) ;----------------------------------------------------------------------------- %macro INTERLEAVE 2 mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL] mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL] mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL] mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL] TRANSPOSE4x4%2 0,1,2,3,4 mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0 mova [r0+(%1+32)*SIZEOF_PIXEL], m1 mova [r0+(%1+64)*SIZEOF_PIXEL], m2 mova [r0+(%1+96)*SIZEOF_PIXEL], m3 packsswb m0, m1 ACCUM por, 6, 2, %1 ACCUM por, 7, 3, %1 ACCUM por, 5, 0, %1 %endmacro %macro ZIGZAG_8x8_CAVLC 1 cglobal zigzag_interleave_8x8_cavlc, 3,3,8 INTERLEAVE 0, %1 INTERLEAVE 8, %1 INTERLEAVE 16, %1 INTERLEAVE 24, %1 packsswb m6, m7 packsswb m5, m6 packsswb m5, m5 pxor m0, m0 %if HIGH_BIT_DEPTH packsswb m5, m5 %endif pcmpeqb m5, m0 paddb m5, [pb_1] movd r0d, m5 mov [r2+0], r0w shr r0d, 16 mov [r2+8], r0w RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 ZIGZAG_8x8_CAVLC D INIT_XMM avx ZIGZAG_8x8_CAVLC D %else INIT_MMX mmx ZIGZAG_8x8_CAVLC W %endif %macro INTERLEAVE_XMM 1 mova m0, [r1+%1*4+ 0] mova m1, 
[r1+%1*4+16] mova m4, [r1+%1*4+32] mova m5, [r1+%1*4+48] SBUTTERFLY wd, 0, 1, 6 SBUTTERFLY wd, 4, 5, 7 SBUTTERFLY wd, 0, 1, 6 SBUTTERFLY wd, 4, 5, 7 movh [r0+%1+ 0], m0 movhps [r0+%1+ 32], m0 movh [r0+%1+ 64], m1 movhps [r0+%1+ 96], m1 movh [r0+%1+ 8], m4 movhps [r0+%1+ 40], m4 movh [r0+%1+ 72], m5 movhps [r0+%1+104], m5 ACCUM por, 2, 0, %1 ACCUM por, 3, 1, %1 por m2, m4 por m3, m5 %endmacro %if HIGH_BIT_DEPTH == 0 %macro ZIGZAG_8x8_CAVLC 0 cglobal zigzag_interleave_8x8_cavlc, 3,3,8 INTERLEAVE_XMM 0 INTERLEAVE_XMM 16 packsswb m2, m3 pxor m5, m5 packsswb m2, m2 packsswb m2, m2 pcmpeqb m5, m2 paddb m5, [pb_1] movd r0d, m5 mov [r2+0], r0w shr r0d, 16 mov [r2+8], r0w RET %endmacro INIT_XMM sse2 ZIGZAG_8x8_CAVLC INIT_XMM avx ZIGZAG_8x8_CAVLC INIT_YMM avx2 cglobal zigzag_interleave_8x8_cavlc, 3,3,6 mova m0, [r1+ 0] mova m1, [r1+32] mova m2, [r1+64] mova m3, [r1+96] mova m5, [deinterleave_shufd] SBUTTERFLY wd, 0, 1, 4 SBUTTERFLY wd, 2, 3, 4 SBUTTERFLY wd, 0, 1, 4 SBUTTERFLY wd, 2, 3, 4 vpermd m0, m5, m0 vpermd m1, m5, m1 vpermd m2, m5, m2 vpermd m3, m5, m3 mova [r0+ 0], xm0 mova [r0+ 16], xm2 vextracti128 [r0+ 32], m0, 1 vextracti128 [r0+ 48], m2, 1 mova [r0+ 64], xm1 mova [r0+ 80], xm3 vextracti128 [r0+ 96], m1, 1 vextracti128 [r0+112], m3, 1 packsswb m0, m2 ; nnz0, nnz1 packsswb m1, m3 ; nnz2, nnz3 packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3} vpermq m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3} pxor m5, m5 pcmpeqq m0, m5 pmovmskb r0d, m0 not r0d and r0d, 0x01010101 mov [r2+0], r0w shr r0d, 16 mov [r2+8], r0w RET %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH INIT_ZMM avx512 cglobal zigzag_scan_4x4_frame, 2,2 mova m0, [scan_frame_avx512] vpermd m0, m0, [r1] mova [r0], m0 RET cglobal zigzag_scan_4x4_field, 2,2 mova m0, [r1] pshufd xmm1, [r1+8], q3102 mova [r0], m0 movu [r0+8], xmm1 RET cglobal zigzag_scan_8x8_frame, 2,2 psrld m0, [scan_frame_avx512], 4 mova m1, [r1+0*64] mova m2, [r1+1*64] mova m3, [r1+2*64] mova m4, [r1+3*64] mov r1d, 0x01fe7f80 kmovd k1, r1d kshiftrd k2, k1, 16 vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40 psrld m6, m0, 5 vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __ vmovdqa64 m0 {k1}, m5 mova [r0+0*64], m0 mova m5, m1 vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __ psrld m0, m6, 5 vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35 vmovdqa32 m6 {k2}, m1 mova [r0+1*64], m6 vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30 psrld m1, m0, 5 vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __ vmovdqa32 m5 {k1}, m0 mova [r0+2*64], m5 vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63 vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __ vmovdqa64 m2 {k2}, m3 mova [r0+3*64], m2 RET cglobal zigzag_scan_8x8_field, 2,2 mova m0, [scan_field_avx512] mova m1, [r1+0*64] mova m2, [r1+1*64] mova m3, [r1+2*64] mova m4, [r1+3*64] mov r1d, 0x3f kmovb k1, r1d psrld m5, m0, 5 vpermi2d m0, m1, m2 vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15 vpermt2d m1, m5, m2 psrld m5, 5 vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31 vpermt2d m2, m5, m3 psrld m5, 5 vpermt2d m3, m5, m4 mova [r0+0*64], m0 mova [r0+1*64], m1 mova [r0+2*64], m2 mova [r0+3*64], m3 RET cglobal zigzag_interleave_8x8_cavlc, 3,3 mova m0, [cavlc_shuf_avx512] mova m1, [r1+0*64] mova m2, [r1+1*64] mova m3, [r1+2*64] mova m4, [r1+3*64] kxnorb k1, k1, k1 por m7, m1, m2 psrld m5, m0, 5 vpermi2d m0, m1, m2 ; a0 a1 b0 b1 vpternlogd m7, m3, m4, 0xfe ; m1|m2|m3|m4 
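; zigzag_interleave_8x8_cavlc splits the 64 coefficients of an 8x8 block into
; the four 4x4 scans CAVLC expects and records a nonzero flag per 4x4 block;
; here vpternlogd with immediate 0xfe ORs all four input vectors (A|B|C) so
; vptestmd can derive those flags from a single mask register. Roughly what
; a plain-C fallback does, sketched with a hypothetical _c name:
;
;   #include <stdint.h>
;   static void zigzag_interleave_8x8_cavlc_sketch( int16_t *dst,
;                                                   const int16_t *src,
;                                                   uint8_t *nnz )
;   {
;       for( int i = 0; i < 4; i++ )
;       {
;           int nz = 0;
;           for( int j = 0; j < 16; j++ )
;           {
;               nz |= src[i + j*4];              /* every 4th coefficient */
;               dst[i*16 + j] = src[i + j*4];
;           }
;           nnz[(i&1) + (i>>1)*8] = !!nz;        /* matches the [r2+0/1/8/9] stores */
;       }
;   }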
psrld m6, m5, 5 vpermi2d m5, m3, m4 ; b2 b3 a2 a3 vptestmd k0, m7, m7 vpermt2d m1, m6, m2 ; c0 c1 d0 d1 psrld m6, 5 vpermt2d m3, m6, m4 ; d2 d3 c2 c3 vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3 vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3 vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3 vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3 mova [r0+0*64], m5 mova [r0+1*64], m2 mova [r0+2*64], m3 mova [r0+3*64], m4 kmovw r1d, k0 test r1d, 0x1111 setnz [r2] test r1d, 0x2222 setnz [r2+1] test r1d, 0x4444 setnz [r2+8] test r1d, 0x8888 setnz [r2+9] RET %else ; !HIGH_BIT_DEPTH INIT_YMM avx512 cglobal zigzag_scan_4x4_frame, 2,2 mova m0, [scan_frame_avx512] vpermw m0, m0, [r1] mova [r0], m0 RET cglobal zigzag_scan_4x4_field, 2,2 mova m0, [r1] pshuflw xmm1, [r1+4], q3102 mova [r0], m0 movq [r0+4], xmm1 RET INIT_ZMM avx512 cglobal zigzag_scan_8x8_frame, 2,2 psrlw m0, [scan_frame_avx512], 4 scan8_avx512: mova m1, [r1] mova m2, [r1+64] psrlw m3, m0, 6 vpermi2w m0, m1, m2 vpermt2w m1, m3, m2 mova [r0], m0 mova [r0+64], m1 RET cglobal zigzag_scan_8x8_field, 2,2 mova m0, [scan_field_avx512] jmp scan8_avx512 cglobal zigzag_interleave_8x8_cavlc, 3,3 mova m0, [cavlc_shuf_avx512] mova m1, [r1] mova m2, [r1+64] psrlw m3, m0, 6 vpermi2w m0, m1, m2 vpermt2w m1, m3, m2 kxnorb k2, k2, k2 vptestmd k0, m0, m0 vptestmd k1, m1, m1 mova [r0], m0 mova [r0+64], m1 ktestw k2, k0 setnz [r2] setnc [r2+1] ktestw k2, k1 setnz [r2+8] setnc [r2+9] RET %endif ; !HIGH_BIT_DEPTH x264-master/common/x86/dct.h000066400000000000000000000410141502133446700157170ustar00rootroot00000000000000/***************************************************************************** * dct.h: x86 transform and zigzag ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_X86_DCT_H #define X264_X86_DCT_H #define x264_sub4x4_dct_mmx x264_template(sub4x4_dct_mmx) void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 ); #define x264_sub8x8_dct_mmx x264_template(sub8x8_dct_mmx) void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 ); #define x264_sub16x16_dct_mmx x264_template(sub16x16_dct_mmx) void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ); #define x264_sub8x8_dct_sse2 x264_template(sub8x8_dct_sse2) void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_sse2 x264_template(sub16x16_dct_sse2) void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub4x4_dct_ssse3 x264_template(sub4x4_dct_ssse3) void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub4x4_dct_avx512 x264_template(sub4x4_dct_avx512) void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_ssse3 x264_template(sub8x8_dct_ssse3) void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_ssse3 x264_template(sub16x16_dct_ssse3) void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_avx x264_template(sub8x8_dct_avx) void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_avx x264_template(sub16x16_dct_avx) void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_xop x264_template(sub8x8_dct_xop) void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_xop x264_template(sub16x16_dct_xop) void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_avx2 x264_template(sub8x8_dct_avx2) void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_avx512 x264_template(sub8x8_dct_avx512) void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_avx2 x264_template(sub16x16_dct_avx2) void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct_avx512 x264_template(sub16x16_dct_avx512) void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_dc_mmx2 x264_template(sub8x8_dct_dc_mmx2) void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct_dc_sse2 x264_template(sub8x8_dct_dc_sse2) void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); #define x264_sub8x8_dct_dc_avx512 x264_template(sub8x8_dct_dc_avx512) void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x16_dct_dc_sse2 x264_template(sub8x16_dct_dc_sse2) void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 ); #define x264_sub8x16_dct_dc_ssse3 x264_template(sub8x16_dct_dc_ssse3) void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x16_dct_dc_avx x264_template(sub8x16_dct_dc_avx) void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 ); #define x264_sub8x16_dct_dc_avx512 x264_template(sub8x16_dct_dc_avx512) void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], 
uint8_t *pix1, uint8_t *pix2 ); #define x264_add4x4_idct_mmx x264_template(add4x4_idct_mmx) void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] ); #define x264_add4x4_idct_sse2 x264_template(add4x4_idct_sse2) void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] ); #define x264_add4x4_idct_sse4 x264_template(add4x4_idct_sse4) void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] ); #define x264_add4x4_idct_avx x264_template(add4x4_idct_avx) void x264_add4x4_idct_avx ( pixel *p_dst, dctcoef dct [16] ); #define x264_add8x8_idct_mmx x264_template(add8x8_idct_mmx) void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] ); #define x264_add8x8_idct_dc_mmx2 x264_template(add8x8_idct_dc_mmx2) void x264_add8x8_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [ 4] ); #define x264_add16x16_idct_mmx x264_template(add16x16_idct_mmx) void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] ); #define x264_add16x16_idct_dc_mmx2 x264_template(add16x16_idct_dc_mmx2) void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] ); #define x264_add8x8_idct_sse2 x264_template(add8x8_idct_sse2) void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); #define x264_add8x8_idct_avx x264_template(add8x8_idct_avx) void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); #define x264_add8x8_idct_avx2 x264_template(add8x8_idct_avx2) void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); #define x264_add8x8_idct_avx512 x264_template(add8x8_idct_avx512) void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] ); #define x264_add16x16_idct_sse2 x264_template(add16x16_idct_sse2) void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); #define x264_add16x16_idct_avx x264_template(add16x16_idct_avx) void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); #define x264_add16x16_idct_avx2 x264_template(add16x16_idct_avx2) void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); #define x264_add8x8_idct_dc_sse2 x264_template(add8x8_idct_dc_sse2) void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] ); #define x264_add16x16_idct_dc_sse2 x264_template(add16x16_idct_dc_sse2) void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] ); #define x264_add8x8_idct_dc_ssse3 x264_template(add8x8_idct_dc_ssse3) void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] ); #define x264_add16x16_idct_dc_ssse3 x264_template(add16x16_idct_dc_ssse3) void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] ); #define x264_add8x8_idct_dc_avx x264_template(add8x8_idct_dc_avx) void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] ); #define x264_add16x16_idct_dc_avx x264_template(add16x16_idct_dc_avx) void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] ); #define x264_add16x16_idct_dc_avx2 x264_template(add16x16_idct_dc_avx2) void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] ); #define x264_dct4x4dc_mmx2 x264_template(dct4x4dc_mmx2) void x264_dct4x4dc_mmx2 ( int16_t d[16] ); #define x264_dct4x4dc_sse2 x264_template(dct4x4dc_sse2) void x264_dct4x4dc_sse2 ( int32_t d[16] ); #define x264_dct4x4dc_avx x264_template(dct4x4dc_avx) void x264_dct4x4dc_avx ( int32_t d[16] ); #define x264_idct4x4dc_mmx x264_template(idct4x4dc_mmx) void x264_idct4x4dc_mmx ( int16_t d[16] ); #define x264_idct4x4dc_sse2 x264_template(idct4x4dc_sse2) void x264_idct4x4dc_sse2 ( int32_t d[16] ); #define x264_idct4x4dc_avx x264_template(idct4x4dc_avx) void x264_idct4x4dc_avx ( int32_t d[16] ); 
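/* Prototype conventions in this header: the sub*_dct* functions compute the
 * forward transform of the residual pix1 - pix2, the add*_idct* functions add
 * the inverse transform of dct[] onto p_dst, and the zigzag_* functions
 * reorder coefficients for entropy coding.  pixel/dctcoef resolve to
 * uint8_t/int16_t in 8-bit builds and uint16_t/int32_t with HIGH_BIT_DEPTH,
 * which is why some per-ISA prototypes spell those types out explicitly;
 * x264_template() decorates each name with the configured bit depth so both
 * library flavours can coexist in one binary. */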
#define x264_dct2x4dc_mmx2 x264_template(dct2x4dc_mmx2) void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] ); #define x264_dct2x4dc_sse2 x264_template(dct2x4dc_sse2) void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] ); #define x264_dct2x4dc_avx x264_template(dct2x4dc_avx) void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] ); #define x264_sub8x8_dct8_mmx x264_template(sub8x8_dct8_mmx) void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct8_mmx x264_template(sub16x16_dct8_mmx) void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct8_sse2 x264_template(sub8x8_dct8_sse2) void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); #define x264_sub16x16_dct8_sse2 x264_template(sub16x16_dct8_sse2) void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); #define x264_sub8x8_dct8_ssse3 x264_template(sub8x8_dct8_ssse3) void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub16x16_dct8_ssse3 x264_template(sub16x16_dct8_ssse3) void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); #define x264_sub8x8_dct8_sse4 x264_template(sub8x8_dct8_sse4) void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 ); #define x264_sub16x16_dct8_sse4 x264_template(sub16x16_dct8_sse4) void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 ); #define x264_sub8x8_dct8_avx x264_template(sub8x8_dct8_avx) void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 ); #define x264_sub16x16_dct8_avx x264_template(sub16x16_dct8_avx) void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); #define x264_sub16x16_dct8_avx2 x264_template(sub16x16_dct8_avx2) void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ); #define x264_add8x8_idct8_mmx x264_template(add8x8_idct8_mmx) void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] ); #define x264_add16x16_idct8_mmx x264_template(add16x16_idct8_mmx) void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] ); #define x264_add8x8_idct8_sse2 x264_template(add8x8_idct8_sse2) void x264_add8x8_idct8_sse2 ( pixel *dst, dctcoef dct [64] ); #define x264_add16x16_idct8_sse2 x264_template(add16x16_idct8_sse2) void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] ); #define x264_add8x8_idct8_avx x264_template(add8x8_idct8_avx) void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] ); #define x264_add16x16_idct8_avx x264_template(add16x16_idct8_avx) void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] ); #define x264_zigzag_scan_8x8_frame_mmx2 x264_template(zigzag_scan_8x8_frame_mmx2) void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_scan_8x8_frame_sse2 x264_template(zigzag_scan_8x8_frame_sse2) void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] ); #define x264_zigzag_scan_8x8_frame_ssse3 x264_template(zigzag_scan_8x8_frame_ssse3) void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_scan_8x8_frame_avx x264_template(zigzag_scan_8x8_frame_avx) void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); #define x264_zigzag_scan_8x8_frame_xop x264_template(zigzag_scan_8x8_frame_xop) void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_scan_8x8_frame_avx512 
x264_template(zigzag_scan_8x8_frame_avx512) void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] ); #define x264_zigzag_scan_4x4_frame_mmx x264_template(zigzag_scan_4x4_frame_mmx) void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_4x4_frame_sse2 x264_template(zigzag_scan_4x4_frame_sse2) void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); #define x264_zigzag_scan_4x4_frame_ssse3 x264_template(zigzag_scan_4x4_frame_ssse3) void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_4x4_frame_avx x264_template(zigzag_scan_4x4_frame_avx) void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); #define x264_zigzag_scan_4x4_frame_xop x264_template(zigzag_scan_4x4_frame_xop) void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); #define x264_zigzag_scan_4x4_frame_avx512 x264_template(zigzag_scan_4x4_frame_avx512) void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] ); #define x264_zigzag_scan_4x4_field_sse x264_template(zigzag_scan_4x4_field_sse) void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] ); #define x264_zigzag_scan_4x4_field_sse2 x264_template(zigzag_scan_4x4_field_sse2) void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); #define x264_zigzag_scan_4x4_field_avx512 x264_template(zigzag_scan_4x4_field_avx512) void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] ); #define x264_zigzag_scan_8x8_field_mmx2 x264_template(zigzag_scan_8x8_field_mmx2) void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_scan_8x8_field_sse4 x264_template(zigzag_scan_8x8_field_sse4) void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); #define x264_zigzag_scan_8x8_field_avx x264_template(zigzag_scan_8x8_field_avx) void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); #define x264_zigzag_scan_8x8_field_xop x264_template(zigzag_scan_8x8_field_xop) void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); #define x264_zigzag_scan_8x8_field_avx512 x264_template(zigzag_scan_8x8_field_avx512) void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] ); #define x264_zigzag_sub_4x4_frame_avx x264_template(zigzag_sub_4x4_frame_avx) int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst ); #define x264_zigzag_sub_4x4_frame_ssse3 x264_template(zigzag_sub_4x4_frame_ssse3) int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); #define x264_zigzag_sub_4x4ac_frame_avx x264_template(zigzag_sub_4x4ac_frame_avx) int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); #define x264_zigzag_sub_4x4ac_frame_ssse3 x264_template(zigzag_sub_4x4ac_frame_ssse3) int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); #define x264_zigzag_sub_4x4_field_avx x264_template(zigzag_sub_4x4_field_avx) int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst ); #define x264_zigzag_sub_4x4_field_ssse3 x264_template(zigzag_sub_4x4_field_ssse3) int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); #define x264_zigzag_sub_4x4ac_field_avx x264_template(zigzag_sub_4x4ac_field_avx) int x264_zigzag_sub_4x4ac_field_avx ( int16_t 
level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); #define x264_zigzag_sub_4x4ac_field_ssse3 x264_template(zigzag_sub_4x4ac_field_ssse3) int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); #define x264_zigzag_interleave_8x8_cavlc_mmx x264_template(zigzag_interleave_8x8_cavlc_mmx) void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); #define x264_zigzag_interleave_8x8_cavlc_sse2 x264_template(zigzag_interleave_8x8_cavlc_sse2) void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #define x264_zigzag_interleave_8x8_cavlc_avx x264_template(zigzag_interleave_8x8_cavlc_avx) void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #define x264_zigzag_interleave_8x8_cavlc_avx2 x264_template(zigzag_interleave_8x8_cavlc_avx2) void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz ); #define x264_zigzag_interleave_8x8_cavlc_avx512 x264_template(zigzag_interleave_8x8_cavlc_avx512) void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif x264-master/common/x86/deblock-a.asm000066400000000000000000001750141502133446700173270ustar00rootroot00000000000000;***************************************************************************** ;* deblock-a.asm: x86 deblocking ;***************************************************************************** ;* Copyright (C) 2005-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 64 load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5 dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9 dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5 dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9 load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text cextern pb_0 cextern pb_1 cextern pb_3 cextern pb_a1 cextern pw_2 cextern pw_4 cextern pw_00ff cextern pw_pixel_max cextern pb_unpackbd1 %if HIGH_BIT_DEPTH ; out: %4 = |%1-%2|-%3 ; clobbers: %5 %macro ABS_SUB 5 psubusw %5, %2, %1 psubusw %4, %1, %2 por %4, %5 psubw %4, %3 %endmacro ; out: %4 = |%1-%2|<%3 %macro DIFF_LT 5 psubusw %4, %2, %1 psubusw %5, %1, %2 por %5, %4 ; |%1-%2| pxor %4, %4 psubw %5, %3 ; |%1-%2|-%3 pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 %endmacro %macro LOAD_AB 4 movd %1, %3 movd %2, %4 SPLATW %1, %1 SPLATW %2, %2 %endmacro ; in: %2=tc reg ; out: %1=splatted tc %macro LOAD_TC 2 %if mmsize == 8 pshufw %1, [%2-1], 0 %else movd %1, [%2] punpcklbw %1, %1 pshuflw %1, %1, q1100 pshufd %1, %1, q1100 %endif psraw %1, 8 %endmacro ; in: %1=p1, %2=p0, %3=q0, %4=q1 ; %5=alpha, %6=beta, %7-%9=tmp ; out: %7=mask %macro LOAD_MASK 9 ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta pand %8, %9 ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta pxor %7, %7 pand %8, %9 pcmpgtw %7, %8 %endmacro ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp ; out: %1=p0', m2=q0' %macro DEBLOCK_P0_Q0 7 psubw %3, %4 pxor %7, %7 paddw %3, [pw_4] psubw %7, %5 psubw %6, %2, %1 psllw %6, 2 paddw %3, %6 psraw %3, 3 mova %6, [pw_pixel_max] CLIPW %3, %7, %5 pxor %7, %7 paddw %1, %3 psubw %2, %3 CLIPW %1, %7, %6 CLIPW %2, %7, %6 %endmacro ; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp %macro LUMA_Q1 6 pavgw %6, %3, %4 ; (p0+q0+1)>>1 paddw %1, %6 pxor %6, %6 psraw %1, 1 psubw %6, %5 psubw %1, %2 CLIPW %1, %6, %5 paddw %1, %2 %endmacro %macro LUMA_DEBLOCK_ONE 3 DIFF_LT m5, %1, bm, m4, m6 pxor m6, m6 mova %3, m4 pcmpgtw m6, tcm pand m4, tcm pandn m6, m7 pand m4, m6 LUMA_Q1 m5, %2, m1, m2, m4, m6 %endmacro %macro LUMA_H_STORE 2 %if mmsize == 8 movq [r0-4], m0 movq [r0+r1-4], m1 movq [r0+r1*2-4], m2 movq [r0+%2-4], m3 %else movq [r0-4], m0 movhps [r0+r1-4], m0 movq [r0+r1*2-4], m1 movhps [%1-4], m1 movq [%1+r1-4], m2 movhps [%1+r1*2-4], m2 movq [%1+%2-4], m3 movhps [%1+r1*4-4], m3 %endif %endmacro %macro DEBLOCK_LUMA 0 ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_luma, 5,5,8,0-5*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] %define am [rsp+mmsize*3] %define bm [rsp+mmsize*4] add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, 32/mmsize mov r2, r0 sub r0, r1 mova am, m4 sub r0, r1 mova bm, m5 sub r0, r1 .loop: mova m0, [r0+r1] mova m1, [r0+r1*2] mova m2, [r2] mova m3, [r2+r1] LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 LOAD_TC m6, r4 mova tcm, m6 mova m5, [r0] LUMA_DEBLOCK_ONE m1, m0, ms1 mova [r0+r1], m5 mova m5, [r2+r1*2] LUMA_DEBLOCK_ONE m2, m3, ms2 mova [r2+r1], m5 pxor m5, m5 mova m6, tcm pcmpgtw m5, tcm psubw m6, ms1 pandn m5, m7 psubw m6, ms2 pand m5, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, 
m6 mova [r0+r1*2], m1 mova [r2], m2 add r0, mmsize add r2, mmsize add r4, mmsize/8 dec r3 jg .loop RET cglobal deblock_h_luma, 5,6,8,0-7*mmsize %define tcm [rsp] %define ms1 [rsp+mmsize] %define ms2 [rsp+mmsize*2] %define p1m [rsp+mmsize*3] %define p2m [rsp+mmsize*4] %define am [rsp+mmsize*5] %define bm [rsp+mmsize*6] add r1, r1 LOAD_AB m4, m5, r2d, r3d mov r3, r1 mova am, m4 add r3, r1 mov r5, 32/mmsize mova bm, m5 add r3, r1 %if mmsize == 16 mov r2, r0 add r2, r3 %endif .loop: %if mmsize == 8 movq m2, [r0-8] ; y q2 q1 q0 movq m7, [r0+0] movq m5, [r0+r1-8] movq m3, [r0+r1+0] movq m0, [r0+r1*2-8] movq m6, [r0+r1*2+0] movq m1, [r0+r3-8] TRANSPOSE4x4W 2, 5, 0, 1, 4 SWAP 2, 7 movq m7, [r0+r3] TRANSPOSE4x4W 2, 3, 6, 7, 4 %else movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x movu m0, [r0+r1-8] movu m2, [r0+r1*2-8] movu m3, [r2-8] TRANSPOSE4x4W 5, 0, 2, 3, 6 mova tcm, m3 movu m4, [r2+r1-8] movu m1, [r2+r1*2-8] movu m3, [r2+r3-8] movu m7, [r2+r1*4-8] TRANSPOSE4x4W 4, 1, 3, 7, 6 mova m6, tcm punpcklqdq m6, m7 punpckhqdq m5, m4 SBUTTERFLY qdq, 0, 1, 7 SBUTTERFLY qdq, 2, 3, 7 %endif mova p2m, m6 LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 LOAD_TC m6, r4 mova tcm, m6 LUMA_DEBLOCK_ONE m1, m0, ms1 mova p1m, m5 mova m5, p2m LUMA_DEBLOCK_ONE m2, m3, ms2 mova p2m, m5 pxor m5, m5 mova m6, tcm pcmpgtw m5, tcm psubw m6, ms1 pandn m5, m7 psubw m6, ms2 pand m5, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 mova m0, p1m mova m3, p2m TRANSPOSE4x4W 0, 1, 2, 3, 4 LUMA_H_STORE r2, r3 add r4, mmsize/8 lea r0, [r0+r1*(mmsize/2)] lea r2, [r2+r1*(mmsize/2)] dec r5 jg .loop RET %endmacro %if ARCH_X86_64 ; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 ; m12=alpha, m13=beta ; out: m0=p1', m3=q1', m1=p0', m2=q0' ; clobbers: m4, m5, m6, m7, m10, m11, m14 %macro DEBLOCK_LUMA_INTER_SSE2 0 LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 LOAD_TC m6, r4 DIFF_LT m8, m1, m13, m10, m4 DIFF_LT m9, m2, m13, m11, m4 pand m6, m7 mova m14, m6 pxor m4, m4 pcmpgtw m6, m4 pand m6, m14 mova m5, m10 pand m5, m6 LUMA_Q1 m8, m0, m1, m2, m5, m4 mova m5, m11 pand m5, m6 LUMA_Q1 m9, m3, m1, m2, m5, m4 pxor m4, m4 psubw m6, m10 pcmpgtw m4, m14 pandn m4, m7 psubw m6, m11 pand m4, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 SWAP 0, 8 SWAP 3, 9 %endmacro %macro DEBLOCK_LUMA_64 0 cglobal deblock_v_luma, 5,5,15 %define p2 m8 %define p1 m0 %define p0 m1 %define q0 m2 %define q1 m3 %define q2 m9 %define mask0 m7 %define mask1 m10 %define mask2 m11 add r1, r1 LOAD_AB m12, m13, r2d, r3d mov r2, r0 sub r0, r1 sub r0, r1 sub r0, r1 mov r3, 2 .loop: mova p2, [r0] mova p1, [r0+r1] mova p0, [r0+r1*2] mova q0, [r2] mova q1, [r2+r1] mova q2, [r2+r1*2] DEBLOCK_LUMA_INTER_SSE2 mova [r0+r1], p1 mova [r0+r1*2], p0 mova [r2], q0 mova [r2+r1], q1 add r0, mmsize add r2, mmsize add r4, 2 dec r3 jg .loop RET cglobal deblock_h_luma, 5,7,15 add r1, r1 LOAD_AB m12, m13, r2d, r3d mov r2, r1 add r2, r1 add r2, r1 mov r5, r0 add r5, r2 mov r6, 2 .loop: movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x movu m0, [r0+r1-8] movu m2, [r0+r1*2-8] movu m9, [r5-8] movu m5, [r5+r1-8] movu m1, [r5+r1*2-8] movu m3, [r5+r2-8] movu m7, [r5+r1*4-8] TRANSPOSE4x4W 8, 0, 2, 9, 10 TRANSPOSE4x4W 5, 1, 3, 7, 10 punpckhqdq m8, m5 SBUTTERFLY qdq, 0, 1, 10 SBUTTERFLY qdq, 2, 3, 10 punpcklqdq m9, m7 DEBLOCK_LUMA_INTER_SSE2 TRANSPOSE4x4W 0, 1, 2, 3, 4 LUMA_H_STORE r5, r2 add r4, 2 lea r0, [r0+r1*8] lea r5, [r5+r1*8] dec r6 jg .loop RET %endmacro INIT_XMM sse2 DEBLOCK_LUMA_64 INIT_XMM avx DEBLOCK_LUMA_64 %endif %macro SWAPMOVA 2 %ifnum sizeof%1 SWAP %1, %2 %else mova %1, %2 %endif %endmacro ; in: t0-t2: tmp registers ; 
%1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 ; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' %macro LUMA_INTRA_P012 12 ; p0..p3 in memory %if ARCH_X86_64 paddw t0, %3, %2 mova t2, %4 paddw t2, %3 %else mova t0, %3 mova t2, %4 paddw t0, %2 paddw t2, %3 %endif paddw t0, %1 paddw t2, t2 paddw t0, %5 paddw t2, %9 paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) psrlw t2, 3 psrlw t1, t0, 2 psubw t2, %3 psubw t1, %2 pand t2, %8 pand t1, %8 paddw t2, %3 paddw t1, %2 SWAPMOVA %11, t1 psubw t1, t0, %3 paddw t0, t0 psubw t1, %5 psubw t0, %3 paddw t1, %6 paddw t1, %2 paddw t0, %6 psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 pxor t0, t1 pxor t1, %1 pand t0, %8 pand t1, %7 pxor t0, t1 pxor t0, %1 SWAPMOVA %10, t0 SWAPMOVA %12, t2 %endmacro %macro LUMA_INTRA_INIT 1 %define t0 m4 %define t1 m5 %define t2 m6 %define t3 m7 %assign i 4 %rep %1 CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] %assign i i+1 %endrep add r1, r1 %endmacro ; in: %1-%3=tmp, %4=p2, %5=q2 %macro LUMA_INTRA_INTER 5 LOAD_AB t0, t1, r2d, r3d mova %1, t0 LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 %if ARCH_X86_64 mova %2, t0 ; mask0 psrlw t3, %1, 2 %else mova t3, %1 mova %2, t0 ; mask0 psrlw t3, 2 %endif paddw t3, [pw_2] ; alpha/4+2 DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 pand t2, %2 mova t3, %5 ; q2 mova %1, t2 ; mask1 DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta pand t2, %1 mova t3, %4 ; p2 mova %3, t2 ; mask1q DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta pand t2, %1 mova %1, t2 ; mask1p %endmacro %macro LUMA_H_INTRA_LOAD 0 %if mmsize == 8 movu t0, [r0-8] movu t1, [r0+r1-8] movu m0, [r0+r1*2-8] movu m1, [r0+r4-8] TRANSPOSE4x4W 4, 5, 0, 1, 2 mova t4, t0 ; p3 mova t5, t1 ; p2 movu m2, [r0] movu m3, [r0+r1] movu t0, [r0+r1*2] movu t1, [r0+r4] TRANSPOSE4x4W 2, 3, 4, 5, 6 mova t6, t0 ; q2 mova t7, t1 ; q3 %else movu t0, [r0-8] movu t1, [r0+r1-8] movu m0, [r0+r1*2-8] movu m1, [r0+r5-8] movu m2, [r4-8] movu m3, [r4+r1-8] movu t2, [r4+r1*2-8] movu t3, [r4+r5-8] TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 mova t4, t0 ; p3 mova t5, t1 ; p2 mova t6, t2 ; q2 mova t7, t3 ; q3 %endif %endmacro ; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp %macro LUMA_H_INTRA_STORE 9 %if mmsize == 8 TRANSPOSE4x4W %1, %2, %3, %4, %9 movq [r0-8], m%1 movq [r0+r1-8], m%2 movq [r0+r1*2-8], m%3 movq [r0+r4-8], m%4 movq m%1, %8 TRANSPOSE4x4W %5, %6, %7, %1, %9 movq [r0], m%5 movq [r0+r1], m%6 movq [r0+r1*2], m%7 movq [r0+r4], m%1 %else TRANSPOSE2x4x4W %1, %2, %3, %4, %9 movq [r0-8], m%1 movq [r0+r1-8], m%2 movq [r0+r1*2-8], m%3 movq [r0+r5-8], m%4 movhps [r4-8], m%1 movhps [r4+r1-8], m%2 movhps [r4+r1*2-8], m%3 movhps [r4+r5-8], m%4 %ifnum %8 SWAP %1, %8 %else mova m%1, %8 %endif TRANSPOSE2x4x4W %5, %6, %7, %1, %9 movq [r0], m%5 movq [r0+r1], m%6 movq [r0+r1*2], m%7 movq [r0+r5], m%1 movhps [r4], m%5 movhps [r4+r1], m%6 movhps [r4+r1*2], m%7 movhps [r4+r5], m%1 %endif %endmacro %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA_INTRA_64 0 cglobal deblock_v_luma_intra, 4,7,16 %define t0 m1 %define t1 m2 %define t2 m4 %define p2 m8 %define p1 m9 %define p0 m10 %define q0 m11 %define q1 m12 %define q2 m13 %define aa m5 %define bb m14 add r1, r1 lea r4, [r1*4] lea r5, [r1*3] ; 3*stride neg r4 add r4, r0 ; pix-4*stride mov r6, 2 mova m0, [pw_2] 
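; m0 keeps the pw_2 rounding constant consumed by LUMA_INTRA_P012, aa/bb get
; the splatted alpha/beta thresholds, and r6 counts two passes of mmsize/2
; 16-bit pixels so the loop covers the whole 16-pixel luma edge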
LOAD_AB aa, bb, r2d, r3d .loop: mova p2, [r4+r1] mova p1, [r4+2*r1] mova p0, [r4+r5] mova q0, [r0] mova q1, [r0+r1] mova q2, [r0+2*r1] LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 mova t2, aa psrlw t2, 2 paddw t2, m0 ; alpha/4+2 DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta pand m6, m3 pand m7, m6 pand m6, t1 LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] add r0, mmsize add r4, mmsize dec r6 jg .loop RET ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,7,16 %define t0 m15 %define t1 m14 %define t2 m2 %define q3 m5 %define q2 m8 %define q1 m9 %define q0 m10 %define p0 m11 %define p1 m12 %define p2 m13 %define p3 m4 %define spill [rsp] %assign pad 24-(stack_offset&15) SUB rsp, pad add r1, r1 lea r4, [r1*4] lea r5, [r1*3] ; 3*stride add r4, r0 ; pix+4*stride mov r6, 2 mova m0, [pw_2] .loop: movu q3, [r0-8] movu q2, [r0+r1-8] movu q1, [r0+r1*2-8] movu q0, [r0+r5-8] movu p0, [r4-8] movu p1, [r4+r1-8] movu p2, [r4+r1*2-8] movu p3, [r4+r5-8] TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 LOAD_AB m1, m2, r2d, r3d LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 psrlw m1, 2 paddw m1, m0 ; alpha/4+2 DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta pand m6, m3 pand m7, m6 pand m6, t1 mova spill, q3 LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 mova m7, spill LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 lea r0, [r0+r1*8] lea r4, [r4+r1*8] dec r6 jg .loop ADD rsp, pad RET %endmacro INIT_XMM sse2 DEBLOCK_LUMA_INTRA_64 INIT_XMM avx DEBLOCK_LUMA_INTRA_64 %endif %macro DEBLOCK_LUMA_INTRA 0 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] neg r4 add r4, r0 mov r6, 32/mmsize .loop: mova m0, [r4+r1*2] ; p1 mova m1, [r4+r5] ; p0 mova m2, [r0] ; q0 mova m3, [r0+r1] ; q1 LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] mova t3, [r0+r1*2] ; q2 LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] add r0, mmsize add r4, mmsize dec r6 jg .loop RET ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] mov r5, 32/mmsize %else lea r4, [r1*4] lea r5, [r1*3] ; 3*stride add r4, r0 ; pix+4*stride mov r6, 32/mmsize %endif .loop: LUMA_H_INTRA_LOAD LUMA_INTRA_INTER t8, t9, t10, t5, t6 LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 mova t3, t6 ; q2 LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 
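; both LUMA_INTRA_P012 passes (p side and q side) are done at this point:
; reload the filtered rows from their spill slots and let LUMA_H_INTRA_STORE
; transpose them back into the frame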
mova m2, t4 mova m0, t11 mova m1, t5 mova m3, t8 mova m6, t6 LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 lea r0, [r0+r1*(mmsize/2)] %if mmsize == 8 dec r5 %else lea r4, [r4+r1*(mmsize/2)] dec r6 %endif jg .loop RET %endmacro %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEBLOCK_LUMA DEBLOCK_LUMA_INTRA INIT_XMM sse2 DEBLOCK_LUMA DEBLOCK_LUMA_INTRA INIT_XMM avx DEBLOCK_LUMA DEBLOCK_LUMA_INTRA %endif %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ; expands to [base],...,[base+7*stride] %define PASS8ROWS(base, base3, stride, stride3) \ [base], [base+stride], [base+stride*2], [base3], \ [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] %define PASS8ROWS(base, base3, stride, stride3, offset) \ PASS8ROWS(base+offset, base3+offset, stride, stride3) ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in %1..%8 %macro TRANSPOSE8x4B_STORE 8 punpckhdq m4, m0, m0 punpckhdq m5, m1, m1 punpckhdq m6, m2, m2 punpcklbw m0, m1 punpcklbw m2, m3 punpcklwd m1, m0, m2 punpckhwd m0, m2 movd %1, m1 punpckhdq m1, m1 movd %2, m1 movd %3, m0 punpckhdq m0, m0 movd %4, m0 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 punpcklwd m5, m4, m6 punpckhwd m4, m6 movd %5, m5 punpckhdq m5, m5 movd %6, m5 movd %7, m4 punpckhdq m4, m4 movd %8, m4 %endmacro ; in: 8 rows of 4 bytes in %9..%10 ; out: 8 rows of 4 bytes in %1..%8 %macro STORE_8x4B 10 movd %1, %9 pextrd %2, %9, 1 pextrd %3, %9, 2 pextrd %4, %9, 3 movd %5, %10 pextrd %6, %10, 1 pextrd %7, %10, 2 pextrd %8, %10, 3 %endmacro ; in: 4 rows of 4 words in %1..%4 ; out: 4 rows of 4 word in m0..m3 ; clobbers: m4 %macro TRANSPOSE4x4W_LOAD 4-8 %if mmsize==8 SWAP 1, 4, 2, 3 movq m0, %1 movq m1, %2 movq m2, %3 movq m3, %4 TRANSPOSE4x4W 0, 1, 2, 3, 4 %else movq m0, %1 movq m2, %2 movq m1, %3 movq m3, %4 punpcklwd m0, m2 punpcklwd m1, m3 mova m2, m0 punpckldq m0, m1 punpckhdq m2, m1 MOVHL m1, m0 MOVHL m3, m2 %endif %endmacro ; in: 2 rows of 4 words in m1..m2 ; out: 4 rows of 2 words in %1..%4 ; clobbers: m0, m1 %macro TRANSPOSE4x2W_STORE 4-8 %if mmsize==8 punpckhwd m0, m1, m2 punpcklwd m1, m2 %else punpcklwd m1, m2 MOVHL m0, m1 %endif movd %3, m0 movd %1, m1 psrlq m1, 32 psrlq m0, 32 movd %2, m1 movd %4, m0 %endmacro ; in: 4/8 rows of 4 words in %1..%8 ; out: 4 rows of 4/8 word in m0..m3 ; clobbers: m4, m5, m6, m7 %macro TRANSPOSE4x8W_LOAD 8 %if mmsize==8 TRANSPOSE4x4W_LOAD %1, %2, %3, %4 %else movq m0, %1 movq m2, %2 movq m1, %3 movq m3, %4 punpcklwd m0, m2 punpcklwd m1, m3 punpckhdq m2, m0, m1 punpckldq m0, m1 movq m4, %5 movq m6, %6 movq m5, %7 movq m7, %8 punpcklwd m4, m6 punpcklwd m5, m7 punpckhdq m6, m4, m5 punpckldq m4, m5 punpckhqdq m1, m0, m4 punpckhqdq m3, m2, m6 punpcklqdq m0, m4 punpcklqdq m2, m6 %endif %endmacro ; in: 2 rows of 4/8 words in m1..m2 ; out: 4/8 rows of 2 words in %1..%8 ; clobbers: m0, m1 %macro TRANSPOSE8x2W_STORE 8 %if mmsize==8 TRANSPOSE4x2W_STORE %1, %2, %3, %4 %else punpckhwd m0, m1, m2 punpcklwd m1, m2 movd %5, m0 movd %1, m1 psrldq m1, 4 psrldq m0, 4 movd %2, m1 movd %6, m0 psrldq m1, 4 psrldq m0, 4 movd %3, m1 movd %7, m0 psrldq m1, 4 psrldq m0, 4 movd %4, m1 movd %8, m0 %endif %endmacro %macro SBUTTERFLY3 4 punpckh%1 %4, %2, %3 punpckl%1 %2, %3 %endmacro ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 ; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] %macro TRANSPOSE6x8_MEM 9 RESET_MM_PERMUTATION %if cpuflag(avx) ; input: ; _ABCDEF_ ; _GHIJKL_ ; _MNOPQR_ ; _STUVWX_ ; _YZabcd_ ; _efghij_ ; _klmnop_ ; _qrstuv_ movh m0, %1 movh m2, %2 movh m1, %3 movh m3, %4 punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __ punpcklbw m1, m3 ; __ MS NT OU PV QW RX __ movh m2, %5 movh m3, %6 punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __ movh m3, %7 movh m4, %8 punpcklbw m3, m4 ; __ kq lr ms nt ou pv __ SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU ; DJ PV EK QW FL RX __ __ SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms ; bh nt ci ou dj pv __ __ SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq ; BH NT Zf lr CI FL OU RX SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW Zf lr ; FL RX dj pv __ __ __ __ movhps [%9+0x00], m0 movh [%9+0x10], m2 movhps [%9+0x20], m2 movh [%9+0x30], m1 movhps [%9+0x40], m1 movh [%9+0x50], m3 %else movq m0, %1 movq m1, %2 movq m2, %3 movq m3, %4 movq m4, %5 movq m5, %6 movq m6, %7 SBUTTERFLY bw, 0, 1, 7 SBUTTERFLY bw, 2, 3, 7 SBUTTERFLY bw, 4, 5, 7 movq [%9+0x10], m3 SBUTTERFLY3 bw, m6, %8, m7 SBUTTERFLY wd, 0, 2, 3 SBUTTERFLY wd, 4, 6, 3 punpckhdq m0, m4 movq [%9+0x00], m0 SBUTTERFLY3 wd, m1, [%9+0x10], m3 SBUTTERFLY wd, 5, 7, 0 SBUTTERFLY dq, 1, 5, 0 SBUTTERFLY dq, 2, 6, 0 punpckldq m3, m7 movq [%9+0x10], m2 movq [%9+0x20], m6 movq [%9+0x30], m1 movq [%9+0x40], m5 movq [%9+0x50], m3 %endif RESET_MM_PERMUTATION %endmacro ; in: 8 rows of 8 in %1..%8 ; out: 8 rows of 8 in %9..%16 %macro TRANSPOSE8x8_MEM 16 RESET_MM_PERMUTATION %if cpuflag(avx) movh m0, %1 movh m4, %2 movh m1, %3 movh m5, %4 movh m2, %5 movh m3, %7 punpcklbw m0, m4 punpcklbw m1, m5 movh m4, %6 movh m5, %8 punpcklbw m2, m4 punpcklbw m3, m5 SBUTTERFLY wd, 0, 1, 4 SBUTTERFLY wd, 2, 3, 4 SBUTTERFLY dq, 0, 2, 4 SBUTTERFLY dq, 1, 3, 4 movh %9, m0 movhps %10, m0 movh %11, m2 movhps %12, m2 movh %13, m1 movhps %14, m1 movh %15, m3 movhps %16, m3 %else movq m0, %1 movq m1, %2 movq m2, %3 movq m3, %4 movq m4, %5 movq m5, %6 movq m6, %7 SBUTTERFLY bw, 0, 1, 7 SBUTTERFLY bw, 2, 3, 7 SBUTTERFLY bw, 4, 5, 7 SBUTTERFLY3 bw, m6, %8, m7 movq %9, m5 SBUTTERFLY wd, 0, 2, 5 SBUTTERFLY wd, 4, 6, 5 SBUTTERFLY wd, 1, 3, 5 movq %11, m6 movq m6, %9 SBUTTERFLY wd, 6, 7, 5 SBUTTERFLY dq, 0, 4, 5 SBUTTERFLY dq, 1, 6, 5 movq %9, m0 movq %10, m4 movq %13, m1 movq %14, m6 SBUTTERFLY3 dq, m2, %11, m0 SBUTTERFLY dq, 3, 7, 4 movq %11, m2 movq %12, m0 movq %15, m3 movq %16, m7 %endif RESET_MM_PERMUTATION %endmacro ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT 5 %if avx_enabled == 0 mova %5, %2 mova %4, %1 psubusb %5, %1 psubusb %4, %2 %else psubusb %5, %2, %1 psubusb %4, %1, %2 %endif por %4, %5 psubusb %4, %3 %endmacro ; out: %4 = |%1-%2|>%3 ; clobbers: %5 %macro DIFF_GT2 5-6 %if %0<6 psubusb %4, %1, %2 psubusb %5, %2, %1 %else mova %4, %1 mova %5, %2 psubusb %4, %2 psubusb %5, %1 %endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha %2=beta ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 %if cpuflag(ssse3) movd m4, %1 movd m5, %2 pxor m6, m6 pshufb m4, m6 pshufb m5, m6 %else movd m4, %1 movd m5, %2 punpcklbw m4, m4 punpcklbw m5, m5 SPLATW m4, m4 SPLATW m5, m5 %endif mova m6, [pb_1] psubusb m4, m6 ; alpha - 1 psubusb m5, m6 ; beta - 1 %if %0>2 mova %3, m4 %endif DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 por m7, m4 DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 por m7, m4 pxor m6, m6 pcmpeqb m7, m6 %endmacro ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) ; out: m1=p0' m2=q0' ; 
clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 pxor m5, m1, m2 ; p0^q0 pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 pavgb m3, m0 ; (p1 - q1 + 256)>>1 pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 pxor m4, m1 pavgb m4, m2 ; (q0 - p0 + 256)>>1 pavgb m3, m5 paddusb m3, m4 ; d+128+33 mova m6, [pb_a1] psubusb m6, m3 psubusb m3, [pb_a1] pminub m6, m7 pminub m3, m7 psubusb m1, m6 psubusb m2, m3 paddusb m1, m3 paddusb m2, m6 %endmacro ; in: m1=p0 m2=q0 ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 pavgb %6, m1, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 psubusb %6, %1, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 mova %4, %2 %endmacro %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA 0 cglobal deblock_v_luma, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] neg r4 add r4, r0 ; pix-3*stride mova m0, [r4+r1] ; p1 mova m1, [r4+2*r1] ; p0 mova m2, [r0] ; q0 mova m3, [r0+r1] ; q1 LOAD_MASK r2d, r3d %if cpuflag(avx) pshufb m8, [pb_unpackbd1] pblendvb m9, m7, m6, m8 %else punpcklbw m8, m8 punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] pcmpeqb m9, m9 pcmpeqb m9, m8 pandn m9, m7 %endif pand m8, m9 mova m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 psubb m7, m8, m6 ; tc++ pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 mova m4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 pand m6, m9 pand m8, m6 psubb m7, m6 mova m3, [r0+r1] LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 DEBLOCK_P0_Q0 mova [r4+2*r1], m1 mova [r0], m2 RET ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %if cpuflag(avx) INIT_XMM cpuname %else INIT_MMX cpuname %endif cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64 lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] %xdefine pix_tmp rsp+0x30*WIN64 ; shadow space + r4 ; transpose 6x16 -> tmp space TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp lea r6, [r6+r1*8] lea r5, [r5+r1*8] TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them mov r7, r1 lea r0, [pix_tmp+0x30] mov r1d, 0x10 %if WIN64 mov [rsp+0x20], r4 %endif call deblock_v_luma ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 add r5, 2 %if cpuflag(sse4) mova m0, [pix_tmp+0x10] mova m1, [pix_tmp+0x20] mova m2, [pix_tmp+0x30] mova m3, [pix_tmp+0x40] SBUTTERFLY bw, 0, 1, 4 SBUTTERFLY bw, 2, 3, 4 SBUTTERFLY wd, 0, 2, 4 SBUTTERFLY wd, 1, 3, 4 STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m1, m3 shl r7, 3 sub r6, r7 sub r5, r7 shr r7, 3 STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m0, m2 %else movq m0, [pix_tmp+0x18] movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) shl r7, 3 sub r6, r7 sub r5, r7 shr r7, 3 movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] TRANSPOSE8x4B_STORE PASS8ROWS(r6, 
r5, r7, r8) %endif RET %endmacro INIT_XMM sse2 DEBLOCK_LUMA INIT_XMM avx DEBLOCK_LUMA %else %macro DEBLOCK_LUMA 2 ;----------------------------------------------------------------------------- ; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_%1_luma, 5,5,8,2*%2 lea r4, [r1*3] neg r4 add r4, r0 ; pix-3*stride mova m0, [r4+r1] ; p1 mova m1, [r4+2*r1] ; p0 mova m2, [r0] ; q0 mova m3, [r0+r1] ; q1 LOAD_MASK r2d, r3d mov r3, r4mp movd m4, [r3] ; tc0 %if cpuflag(avx) pshufb m4, [pb_unpackbd1] mova [esp+%2], m4 ; tc pblendvb m4, m7, m6, m4 %else punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] mova [esp+%2], m4 ; tc pcmpeqb m3, m3 pcmpgtb m4, m3 pand m4, m7 %endif mova [esp], m4 ; mask mova m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%2] ; tc psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 mova m4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 mova m5, [esp] ; mask pand m6, m5 mova m5, [esp+%2] ; tc pand m5, m6 psubb m7, m6 mova m3, [r0+r1] LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 DEBLOCK_P0_Q0 mova [r4+2*r1], m1 mova [r0], m2 RET ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %if cpuflag(avx) INIT_XMM cpuname %else INIT_MMX cpuname %endif cglobal deblock_h_luma, 1,5,8,0x60+12 mov r3, r1m lea r4, [r3*3] sub r0, 4 lea r1, [r0+r4] %define pix_tmp esp+12 ; esp is intentionally misaligned to make it aligned after pushing the arguments for deblock_%1_luma. 
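; (the five 4-byte PUSHes below subtract another 20 bytes, which is what leaves
;  esp aligned again when deblock_%1_luma is called)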
; transpose 6x16 -> tmp space TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp lea r0, [r0+r3*8] lea r1, [r1+r3*8] TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 ; vertical filter lea r0, [pix_tmp+0x30] PUSH dword r4m PUSH dword r3m PUSH dword r2m PUSH dword 16 PUSH dword r0 call deblock_%1_luma %ifidn %1, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 call deblock_%1_luma %endif ADD esp, 20 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) mov r0, r0mp sub r0, 2 lea r1, [r0+r4] %if cpuflag(avx) mova m0, [pix_tmp+0x10] mova m1, [pix_tmp+0x20] mova m2, [pix_tmp+0x30] mova m3, [pix_tmp+0x40] SBUTTERFLY bw, 0, 1, 4 SBUTTERFLY bw, 2, 3, 4 SBUTTERFLY wd, 0, 2, 4 SBUTTERFLY wd, 1, 3, 4 STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m0, m2 lea r0, [r0+r3*8] lea r1, [r1+r3*8] STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m1, m3 %else movq m0, [pix_tmp+0x10] movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] movq m0, [pix_tmp+0x18] movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) %endif RET %endmacro ; DEBLOCK_LUMA INIT_MMX mmx2 DEBLOCK_LUMA v8, 8 INIT_XMM sse2 DEBLOCK_LUMA v, 16 INIT_XMM avx DEBLOCK_LUMA v, 16 %endif ; ARCH %macro LUMA_INTRA_P012 4 ; p0..p3 in memory %if ARCH_X86_64 pavgb t0, p2, p1 pavgb t1, p0, q0 %else mova t0, p2 mova t1, p0 pavgb t0, p1 pavgb t1, q0 %endif pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 mova t5, t1 %if ARCH_X86_64 paddb t2, p2, p1 paddb t3, p0, q0 %else mova t2, p2 mova t3, p0 paddb t2, p1 paddb t3, q0 %endif paddb t2, t3 mova t3, t2 mova t4, t2 psrlw t2, 1 pavgb t2, mpb_0 pxor t2, t0 pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; %if ARCH_X86_64 pavgb t1, p2, q1 psubb t2, p2, q1 %else mova t1, p2 mova t2, p2 pavgb t1, q1 psubb t2, q1 %endif paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 pand t2, mpb_1 psubb t1, t2 pavgb t1, p1 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 psrlw t3, 2 pavgb t3, mpb_0 pxor t3, t1 pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 pxor t3, p0, q1 pavgb t2, p0, q1 pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 pxor t1, t2 pxor t2, p0 pand t1, mask1p pand t2, mask0 pxor t1, t2 pxor t1, p0 mova %1, t1 ; store p0 mova t1, %4 ; p3 paddb t2, t1, p2 pavgb t1, p2 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 psrlw t2, 2 pavgb t2, mpb_0 pxor t2, t1 pand t2, mpb_1 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 pxor t0, p1 pxor t1, p2 pand t0, mask1p pand t1, mask1p pxor t0, p1 pxor t1, p2 mova %2, t0 ; store p1 mova %3, t1 ; store p2 %endmacro %macro LUMA_INTRA_SWAP_PQ 0 %define q1 m0 %define q0 m1 %define p0 m2 %define p1 m3 %define p2 q2 %define mask1p mask1q %endmacro %macro DEBLOCK_LUMA_INTRA 1 %define p1 m0 %define p0 m1 %define q0 m2 %define q1 m3 %define t0 m4 %define t1 m5 %define t2 m6 %define t3 m7 %if ARCH_X86_64 %define p2 m8 %define q2 m9 %define t4 m10 %define t5 m11 %define mask0 m12 %define mask1p m13 %if WIN64 %define mask1q [rsp] %else %define mask1q [rsp-24] %endif %define mpb_0 m14 %define mpb_1 m15 %else %define spill(x) [esp+16*x] %define p2 [r4+r1] %define q2 [r0+2*r1] %define t4 spill(0) %define t5 spill(1) %define mask0 spill(2) %define mask1p spill(3) %define mask1q spill(4) %define mpb_0 [pb_0] %define mpb_1 [pb_1] %endif 
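; register allocation for the intra filter: on x86-64 p2/q2 and the temporaries
; live in xmm8-xmm15 and only mask1q spills to memory, while on x86-32 p2/q2
; are re-read from the pixel rows and the temporaries/masks use the 0x50-byte
; stack area reserved by cglobal below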
;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10 lea r4, [r1*4] lea r5, [r1*3] ; 3*stride neg r4 add r4, r0 ; pix-4*stride mova p1, [r4+2*r1] mova p0, [r4+r5] mova q0, [r0] mova q1, [r0+r1] %if ARCH_X86_64 pxor mpb_0, mpb_0 mova mpb_1, [pb_1] LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 SWAP 7, 12 ; m12=mask0 pavgb t5, mpb_0 pavgb t5, mpb_1 ; alpha/4+1 movdqa p2, [r4+r1] movdqa q2, [r0+2*r1] DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 DIFF_GT2 p0, p2, m5, t2, t5, 1 ; mask1 = |p2-p0| > beta-1 DIFF_GT2 q0, q2, m5, t4, t5, 1 ; t4 = |q2-q0| > beta-1 pand t0, mask0 pand t4, t0 pand t2, t0 mova mask1q, t4 mova mask1p, t2 %else LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 mova m4, t5 mova mask0, m7 pavgb m4, [pb_0] pavgb m4, [pb_1] ; alpha/4+1 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 pand m6, mask0 DIFF_GT2 p0, p2, m5, m4, m7, 1 ; m4 = |p2-p0| > beta-1 pand m4, m6 mova mask1p, m4 DIFF_GT2 q0, q2, m5, m4, m7, 1 ; m4 = |q2-q0| > beta-1 pand m4, m6 mova mask1q, m4 %endif LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] LUMA_INTRA_SWAP_PQ LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] .end: REP_RET %if cpuflag(avx) INIT_XMM cpuname %else INIT_MMX cpuname %endif %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,9,0,0x80 lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] %if WIN64 %define pix_tmp rsp+0x20 ; shadow space %else %define pix_tmp rsp %endif ; transpose 8x16 -> tmp space TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) lea r6, [r6+r1*8] lea r5, [r5+r1*8] TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) mov r7, r1 lea r0, [pix_tmp+0x40] mov r1, 0x10 call deblock_v_luma_intra ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r8] TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) shl r7, 3 sub r6, r7 sub r5, r7 shr r7, 3 TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) RET %else cglobal deblock_h_luma_intra, 2,4,8,0x80 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] %define pix_tmp rsp ; transpose 8x16 -> tmp space TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) lea r0, [r0+r1*8] lea r2, [r2+r1*8] TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) lea r0, [pix_tmp+0x40] PUSH dword r3m PUSH dword r2m PUSH dword 16 PUSH r0 call deblock_%1_luma_intra %ifidn %1, v8 add dword [rsp], 8 ; pix_tmp+8 call deblock_%1_luma_intra %endif ADD esp, 16 mov r1, r1m mov r0, r0mp lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) lea r0, [r0+r1*8] lea r2, [r2+r1*8] TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) RET %endif ; ARCH_X86_64 %endmacro ; DEBLOCK_LUMA_INTRA INIT_XMM sse2 
DEBLOCK_LUMA_INTRA v INIT_XMM avx DEBLOCK_LUMA_INTRA v %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEBLOCK_LUMA_INTRA v8 %endif %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp ; out: %1=p0', %2=q0' %macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 mova %6, [pw_2] paddw %6, %3 paddw %6, %4 paddw %7, %6, %2 paddw %6, %1 paddw %6, %3 paddw %7, %4 psraw %6, 2 psraw %7, 2 psubw %6, %1 psubw %7, %2 pand %6, %5 pand %7, %5 paddw %1, %6 paddw %2, %7 %endmacro ; out: m0-m3 ; clobbers: m4-m7 %macro CHROMA_H_LOAD 0-1 movq m0, [r0-8] ; p1 p1 p0 p0 movq m2, [r0] ; q0 q0 q1 q1 movq m5, [r0+r1-8] movq m7, [r0+r1] %if mmsize == 8 mova m1, m0 mova m3, m2 punpckldq m0, m5 ; p1 punpckhdq m1, m5 ; p0 punpckldq m2, m7 ; q0 punpckhdq m3, m7 ; q1 %else movq m4, [r0+r1*2-8] movq m6, [r0+r1*2] movq m1, [r0+%1-8] movq m3, [r0+%1] punpckldq m0, m5 ; p1 ... p0 ... punpckldq m2, m7 ; q0 ... q1 ... punpckldq m4, m1 punpckldq m6, m3 punpckhqdq m1, m0, m4 ; p0 punpcklqdq m0, m4 ; p1 punpckhqdq m3, m2, m6 ; q1 punpcklqdq m2, m6 ; q0 %endif %endmacro %macro CHROMA_V_LOAD 1 mova m0, [r0] ; p1 mova m1, [r0+r1] ; p0 mova m2, [%1] ; q0 mova m3, [%1+r1] ; q1 %endmacro ; clobbers: m1, m2, m3 %macro CHROMA_H_STORE 0-1 SBUTTERFLY dq, 1, 2, 3 %if mmsize == 8 movq [r0-4], m1 movq [r0+r1-4], m2 %else movq [r0-4], m1 movq [r0+r1*2-4], m2 movhps [r0+r1-4], m1 movhps [r0+%1-4], m2 %endif %endmacro %macro CHROMA_V_STORE 0 mova [r0+1*r1], m1 mova [r0+2*r1], m2 %endmacro %macro DEBLOCK_CHROMA 0 cglobal deblock_inter_body LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 pxor m4, m4 LOAD_TC m6, r4 pmaxsw m6, m4 pand m7, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 ret ;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma, 5,7,8 FIX_STRIDES r1 mov r5, r0 sub r0, r1 sub r0, r1 mov r6, 32/mmsize .loop: CHROMA_V_LOAD r5 call deblock_inter_body CHROMA_V_STORE add r0, mmsize add r5, mmsize add r4, mmsize/8 dec r6 jg .loop RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 add r1, r1 mov r5, 32/mmsize %if mmsize == 16 lea r6, [r1*3] %endif .loop: CHROMA_H_LOAD r6 call deblock_inter_body CHROMA_H_STORE r6 lea r0, [r0+r1*(mmsize/4)] add r4, mmsize/8 dec r5 jg .loop RET cglobal deblock_intra_body LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 ret ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma_intra, 4,6,8 add r1, r1 mov r5, 32/mmsize movd m5, r3d mov r4, r0 sub r0, r1 sub r0, r1 SPLATW m5, m5 .loop: CHROMA_V_LOAD r4 call deblock_intra_body CHROMA_V_STORE add r0, mmsize add r4, mmsize dec r5 jg .loop RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal 
deblock_h_chroma_intra, 4,6,8 add r1, r1 mov r4, 32/mmsize %if mmsize == 16 lea r5, [r1*3] %endif .loop: CHROMA_H_LOAD r5 call deblock_intra_body CHROMA_H_STORE r5 lea r0, [r0+r1*(mmsize/4)] dec r4 jg .loop RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra_mbaff, 4,6,8 add r1, r1 %if mmsize == 8 mov r4, 16/mmsize .loop: %else lea r5, [r1*3] %endif CHROMA_H_LOAD r5 LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 CHROMA_H_STORE r5 %if mmsize == 8 lea r0, [r0+r1*(mmsize/4)] dec r4 jg .loop %endif RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_mbaff, 5,7,8 add r1, r1 lea r6, [r1*3] %if mmsize == 8 mov r5, 16/mmsize .loop: %endif CHROMA_H_LOAD r6 LOAD_AB m4, m5, r2d, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 movd m6, [r4] punpcklbw m6, m6 psraw m6, 8 punpcklwd m6, m6 pand m7, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 CHROMA_H_STORE r6 %if mmsize == 8 lea r0, [r0+r1*(mmsize/4)] add r4, mmsize/4 dec r5 jg .loop %endif RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_422_intra, 4,6,8 add r1, r1 mov r4, 64/mmsize %if mmsize == 16 lea r5, [r1*3] %endif .loop: CHROMA_H_LOAD r5 call deblock_intra_body CHROMA_H_STORE r5 lea r0, [r0+r1*(mmsize/4)] dec r4 jg .loop RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_422, 5,7,8 add r1, r1 mov r5, 64/mmsize lea r6, [r1*3] .loop: CHROMA_H_LOAD r6 LOAD_AB m4, m5, r2m, r3d LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 pxor m4, m4 movd m6, [r4-1] psraw m6, 8 SPLATW m6, m6 pmaxsw m6, m4 pand m7, m6 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 CHROMA_H_STORE r6 lea r0, [r0+r1*(mmsize/4)] %if mmsize == 16 inc r4 %else mov r2, r5 and r2, 1 add r4, r2 ; increment once every 2 iterations %endif dec r5 jg .loop RET %endmacro ; DEBLOCK_CHROMA %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEBLOCK_CHROMA %endif INIT_XMM sse2 DEBLOCK_CHROMA INIT_XMM avx DEBLOCK_CHROMA %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 %macro CHROMA_V_START 0 mov t5, r0 sub t5, r1 sub t5, r1 %if mmsize==8 mov dword r0m, 2 .loop: %endif %endmacro %macro CHROMA_H_START 0 sub r0, 4 lea t6, [r1*3] mov t5, r0 add r0, t6 %endmacro %macro CHROMA_V_LOOP 1 %if mmsize==8 add r0, 8 add t5, 8 %if %1 add r4, 2 %endif dec dword r0m jg .loop %endif %endmacro %macro CHROMA_H_LOOP 1 %if mmsize==8 lea r0, [r0+r1*4] lea t5, [t5+r1*4] %if %1 add r4, 2 %endif dec dword r0m jg .loop %endif %endmacro %define t5 r5 %define t6 r6 %macro DEBLOCK_CHROMA 0 cglobal chroma_inter_body LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 ret 
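; chroma_inter_body is the shared core called by the v/h chroma entry points
; below: it builds the alpha/beta mask, replicates each tc0 byte four times
; into m6, ANDs that into the mask and runs DEBLOCK_P0_Q0 on the p1/p0/q0/q1
; rows already loaded in m0-m3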
;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma, 5,6,8 CHROMA_V_START mova m0, [t5] mova m1, [t5+r1] mova m2, [r0] mova m3, [r0+r1] call chroma_inter_body mova [t5+r1], m1 mova [r0], m2 CHROMA_V_LOOP 1 RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 CHROMA_H_START %if mmsize==8 mov dword r0m, 2 .loop: %endif TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_inter_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) CHROMA_H_LOOP 1 RET %endmacro ; DEBLOCK_CHROMA INIT_XMM sse2 DEBLOCK_CHROMA INIT_XMM avx DEBLOCK_CHROMA %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEBLOCK_CHROMA %endif ;----------------------------------------------------------------------------- ; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_H_CHROMA_420_MBAFF 0 cglobal deblock_h_chroma_mbaff, 5,7,8 CHROMA_H_START TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6) LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) RET %endmacro INIT_XMM sse2 DEBLOCK_H_CHROMA_420_MBAFF %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEBLOCK_H_CHROMA_420_MBAFF %endif %macro DEBLOCK_H_CHROMA_422 0 cglobal deblock_h_chroma_422, 5,8,8 %if ARCH_X86_64 %define cntr r7 %else %define cntr dword r0m %endif CHROMA_H_START mov cntr, 32/mmsize .loop: TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 %if mmsize == 16 punpcklbw m6, m6 punpcklbw m6, m6 %else pshufw m6, m6, q0000 %endif pand m7, m6 DEBLOCK_P0_Q0 TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) lea r0, [r0+r1*(mmsize/2)] lea t5, [t5+r1*(mmsize/2)] add r4, mmsize/8 dec cntr jg .loop RET %endmacro INIT_MMX mmx2 DEBLOCK_H_CHROMA_422 INIT_XMM sse2 DEBLOCK_H_CHROMA_422 INIT_XMM avx DEBLOCK_H_CHROMA_422 ; in: %1=p0 %2=p1 %3=q1 ; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 %macro CHROMA_INTRA_P0 3 pxor m4, %1, %3 pand m4, [pb_1] ; m4 = (p0^q1)&1 pavgb %1, %3 psubusb %1, m4 pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) %endmacro %define t5 r4 %define t6 r5 %macro DEBLOCK_CHROMA_INTRA_BODY 0 cglobal chroma_intra_body LOAD_MASK r2d, r3d mova m5, m1 mova m6, m2 CHROMA_INTRA_P0 m1, m0, m3 CHROMA_INTRA_P0 m2, m3, m0 psubb m1, m5 psubb m2, m6 pand m1, m7 pand m2, m7 paddb m1, m5 paddb m2, m6 ret %endmacro %macro DEBLOCK_CHROMA_INTRA 0 ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma_intra, 4,5,8 CHROMA_V_START mova m0, [t5] mova m1, [t5+r1] mova m2, [r0] mova m3, [r0+r1] call chroma_intra_body mova [t5+r1], m1 mova [r0], m2 CHROMA_V_LOOP 0 RET ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal 
deblock_h_chroma_intra, 4,6,8 CHROMA_H_START %if mmsize==8 mov dword r0m, 2 .loop: %endif TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) CHROMA_H_LOOP 0 RET cglobal deblock_h_chroma_422_intra, 4,7,8 CHROMA_H_START mov r6d, 32/mmsize .loop: TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) lea r0, [r0+r1*(mmsize/2)] lea t5, [t5+r1*(mmsize/2)] dec r6d jg .loop RET %endmacro ; DEBLOCK_CHROMA_INTRA INIT_XMM sse2 DEBLOCK_CHROMA_INTRA_BODY DEBLOCK_CHROMA_INTRA INIT_XMM avx DEBLOCK_CHROMA_INTRA_BODY DEBLOCK_CHROMA_INTRA INIT_MMX mmx2 DEBLOCK_CHROMA_INTRA_BODY %if ARCH_X86_64 == 0 DEBLOCK_CHROMA_INTRA %endif ;----------------------------------------------------------------------------- ; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal deblock_h_chroma_intra_mbaff, 4,6,8 CHROMA_H_START TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) RET %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2], ; uint8_t bs[2][4][4], int mvy_limit, int bframe ) ;----------------------------------------------------------------------------- %define scan8start (4+1*8) %define nnz r0+scan8start %define ref r1+scan8start %define mv r2+scan8start*4 %define bs0 r3 %define bs1 r3+32 %macro LOAD_BYTES_XMM 2 ; src, aligned %if %2 mova m2, [%1-4] mova m1, [%1+12] %else movu m2, [%1-4] movu m1, [%1+12] %endif psllq m0, m2, 8 shufps m2, m1, q3131 ; cur nnz, all rows psllq m1, 8 shufps m0, m1, q3131 ; left neighbors %if cpuflag(avx) || (%2 && cpuflag(ssse3)) palignr m1, m2, [%1-20], 12 %else pslldq m1, m2, 4 movd m3, [%1-8] por m1, m3 ; top neighbors %endif %endmacro %if UNIX64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 4 %endif %macro DEBLOCK_STRENGTH_XMM 0 cglobal deblock_strength, 5,5,7 ; Prepare mv comparison register shl r4d, 8 add r4d, 3 - (1<<8) movd m6, r4d movifnidn t0d, r5m SPLATW m6, m6 pxor m4, m4 ; bs0 pxor m5, m5 ; bs1 .lists: ; Check refs LOAD_BYTES_XMM ref, 0 pxor m0, m2 pxor m1, m2 por m4, m0 por m5, m1 ; Check mvs %if cpuflag(ssse3) && notcpuflag(avx) mova m0, [mv+4*8*0] mova m1, [mv+4*8*1] palignr m3, m0, [mv+4*8*0-16], 12 palignr m2, m1, [mv+4*8*1-16], 12 psubw m0, m3 psubw m1, m2 packsswb m0, m1 mova m2, [mv+4*8*2] mova m1, [mv+4*8*3] palignr m3, m2, [mv+4*8*2-16], 12 psubw m2, m3 palignr m3, m1, [mv+4*8*3-16], 12 psubw m1, m3 packsswb m2, m1 %else movu m0, [mv-4+4*8*0] movu m1, [mv-4+4*8*1] movu m2, [mv-4+4*8*2] movu m3, [mv-4+4*8*3] psubw m0, [mv+4*8*0] psubw m1, [mv+4*8*1] psubw m2, [mv+4*8*2] psubw m3, [mv+4*8*3] packsswb m0, m1 packsswb m2, m3 %endif ABSB m0, m1 ABSB m2, m3 psubusb m0, m6 psubusb m2, m6 packsswb m0, m2 por m4, m0 mova m0, [mv+4*8*-1] mova m1, [mv+4*8* 0] mova m2, [mv+4*8* 1] mova m3, [mv+4*8* 2] psubw m0, m1 psubw m1, m2 psubw m2, m3 psubw m3, [mv+4*8* 3] packsswb m0, m1 packsswb m2, m3 ABSB m0, m1 ABSB m2, m3 psubusb m0, m6 psubusb m2, m6 packsswb m0, m2 por m5, m0 add r1, 40 add r2, 4*8*5 dec t0d jge .lists ; Check nnz LOAD_BYTES_XMM nnz, 1 por m0, m2 por m1, m2 mova m6, [pb_1] pminub m0, m6 pminub m1, m6 pminub m4, m6 ; mv ? 1 : 0 pminub m5, m6 paddb m0, m0 ; nnz ? 
2 : 0 paddb m1, m1 pmaxub m4, m0 pmaxub m5, m1 %if cpuflag(ssse3) pshufb m4, [transpose_shuf] %else movhlps m3, m4 punpcklbw m4, m3 movhlps m3, m4 punpcklbw m4, m3 %endif mova [bs1], m5 mova [bs0], m4 RET %endmacro INIT_XMM sse2 DEBLOCK_STRENGTH_XMM INIT_XMM ssse3 DEBLOCK_STRENGTH_XMM INIT_XMM avx DEBLOCK_STRENGTH_XMM %macro LOAD_BYTES_YMM 1 movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX pshufb m0, m6 ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 vpbroadcastd m2, [%1-8] ; ABCD .... vpblendd m0, m0, m2, 0x80 vpermd m0, m7, m0 ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS %endmacro INIT_YMM avx2 cglobal deblock_strength, 5,5,8 mova m6, [load_bytes_ymm_shuf] ; Prepare mv comparison register shl r4d, 8 add r4d, 3 - (1<<8) movd xm5, r4d movifnidn t0d, r5m vpbroadcastw m5, xm5 psrld m7, m6, 4 pxor m4, m4 ; bs0,bs1 .lists: ; Check refs LOAD_BYTES_YMM ref pxor m0, m1 por m4, m0 ; Check mvs movu xm0, [mv+0*4*8-4] vinserti128 m0, m0, [mv-1*4*8 ], 1 vbroadcasti128 m2, [mv+0*4*8 ] vinserti128 m1, m2, [mv+1*4*8-4], 0 psubw m0, m2 vbroadcasti128 m2, [mv+1*4*8 ] psubw m1, m2 packsswb m0, m1 vinserti128 m1, m2, [mv+2*4*8-4], 0 vbroadcasti128 m3, [mv+2*4*8 ] vinserti128 m2, m3, [mv+3*4*8-4], 0 psubw m1, m3 vbroadcasti128 m3, [mv+3*4*8 ] psubw m2, m3 packsswb m1, m2 pabsb m0, m0 pabsb m1, m1 psubusb m0, m5 psubusb m1, m5 packsswb m0, m1 por m4, m0 add r1, 40 add r2, 4*8*5 dec t0d jge .lists ; Check nnz LOAD_BYTES_YMM nnz mova m2, [pb_1] por m0, m1 pminub m0, m2 pminub m4, m2 ; mv ? 1 : 0 paddb m0, m0 ; nnz ? 2 : 0 pmaxub m0, m4 vextracti128 [bs1], m0, 1 pshufb xm0, [transpose_shuf] mova [bs0], xm0 RET %macro LOAD_BYTES_ZMM 1 vpermd m1, m6, [%1-12] pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX %endmacro ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX INIT_ZMM avx512 cglobal deblock_strength, 5,5 mova m6, [load_bytes_zmm_shuf] shl r4d, 8 add r4d, 3 - (1<<8) vpbroadcastw m5, r4d mov r4d, 0x34cc34cc ; {1,-1} * 11001100b kmovb k1, r4d vpbroadcastd m4, r4d movifnidn t0d, r5m psrld m7, m6, 4 pxor xm3, xm3 .lists: vbroadcasti64x2 m2, [mv+32] vinserti64x2 m0, m2, [mv-32], 2 vbroadcasti64x2 m1, [mv+ 0] vinserti64x2 m0, m0, [mv- 4], 0 vbroadcasti64x2 m1 {k1}, [mv+64] vinserti64x2 m0, m0, [mv+60], 1 psubw m0, m1 vinserti64x2 m1, m1, [mv+28], 0 vbroadcasti64x2 m2 {k1}, [mv+96] vinserti64x2 m1, m1, [mv+92], 1 psubw m1, m2 packsswb m0, m1 pabsb m0, m0 psubusb m0, m5 LOAD_BYTES_ZMM ref pmaddubsw m1, m4 ; E-F F-G G-H H-I ... vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1 add r1, 40 add r2, 4*8*5 dec t0d jge .lists LOAD_BYTES_ZMM nnz mova ym2, [pb_1] vptestmw k1, m1, m1 vptestmw k2, m3, m3 vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 2 : 0 vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0 vextracti128 [bs1], ym0, 1 pshufb xm0, [transpose_shuf] mova [bs0], xm0 RET x264-master/common/x86/deblock.h000066400000000000000000000242511502133446700165540ustar00rootroot00000000000000/***************************************************************************** * deblock.h: x86 deblocking ***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_X86_DEBLOCK_H #define X264_X86_DEBLOCK_H #define x264_deblock_v_luma_sse2 x264_template(deblock_v_luma_sse2) void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_luma_avx x264_template(deblock_v_luma_avx) void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_sse2 x264_template(deblock_h_luma_sse2) void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_avx x264_template(deblock_h_luma_avx) void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_chroma_sse2 x264_template(deblock_v_chroma_sse2) void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_chroma_avx x264_template(deblock_v_chroma_avx) void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_sse2 x264_template(deblock_h_chroma_sse2) void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_avx x264_template(deblock_h_chroma_avx) void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_mbaff_sse2 x264_template(deblock_h_chroma_mbaff_sse2) void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_mbaff_avx x264_template(deblock_h_chroma_mbaff_avx) void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_422_mmx2 x264_template(deblock_h_chroma_422_mmx2) void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_422_sse2 x264_template(deblock_h_chroma_422_sse2) void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_422_avx x264_template(deblock_h_chroma_422_avx) void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_luma_intra_sse2 x264_template(deblock_v_luma_intra_sse2) void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_luma_intra_avx x264_template(deblock_v_luma_intra_avx) void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_sse2 x264_template(deblock_h_luma_intra_sse2) void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_luma_intra_avx x264_template(deblock_h_luma_intra_avx) void 
x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_sse2 x264_template(deblock_v_chroma_intra_sse2) void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_avx x264_template(deblock_v_chroma_intra_avx) void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_sse2 x264_template(deblock_h_chroma_intra_sse2) void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_avx x264_template(deblock_h_chroma_intra_avx) void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_422_intra_mmx2 x264_template(deblock_h_chroma_422_intra_mmx2) void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_422_intra_sse2 x264_template(deblock_h_chroma_422_intra_sse2) void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_422_intra_avx x264_template(deblock_h_chroma_422_intra_avx) void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_strength_sse2 x264_template(deblock_strength_sse2) void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_strength_ssse3 x264_template(deblock_strength_ssse3) void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_strength_avx x264_template(deblock_strength_avx) void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_strength_avx2 x264_template(deblock_strength_avx2) void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_strength_avx512 x264_template(deblock_strength_avx512) void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); #define x264_deblock_h_chroma_intra_mbaff_mmx2 x264_template(deblock_h_chroma_intra_mbaff_mmx2) void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_mbaff_sse2 x264_template(deblock_h_chroma_intra_mbaff_sse2) void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_mbaff_avx x264_template(deblock_h_chroma_intra_mbaff_avx) void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #if ARCH_X86 #define x264_deblock_h_luma_mmx2 x264_template(deblock_h_luma_mmx2) void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_v8_luma_mmx2 x264_template(deblock_v8_luma_mmx2) void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int 
alpha, int beta, int8_t *tc0 ); #define x264_deblock_v_chroma_mmx2 x264_template(deblock_v_chroma_mmx2) void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_mmx2 x264_template(deblock_h_chroma_mmx2) void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_chroma_mbaff_mmx2 x264_template(deblock_h_chroma_mbaff_mmx2) void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #define x264_deblock_h_luma_intra_mmx2 x264_template(deblock_h_luma_intra_mmx2) void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v8_luma_intra_mmx2 x264_template(deblock_v8_luma_intra_mmx2) void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_mmx2 x264_template(deblock_v_chroma_intra_mmx2) void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_h_chroma_intra_mmx2 x264_template(deblock_h_chroma_intra_mmx2) void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_chroma_intra_mbaff_mmx2 x264_template(deblock_v_chroma_intra_mbaff_mmx2) void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #define x264_deblock_v_luma_mmx2 x264_template(deblock_v_luma_mmx2) #define x264_deblock_v_luma_intra_mmx2 x264_template(deblock_v_luma_intra_mmx2) #if HIGH_BIT_DEPTH void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #else // FIXME this wrapper has a significant cpu cost static ALWAYS_INLINE void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 ); x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 ); } static ALWAYS_INLINE void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ) { x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta ); x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta ); } #endif // HIGH_BIT_DEPTH #endif #endif x264-master/common/x86/mc-a.asm000066400000000000000000001402421502133446700163160ustar00rootroot00000000000000;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Dylan Yudaken ;* Holger Lubitz ;* Min Chen ;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. 
;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 ch_shuf_adj: times 8 db 0 times 8 db 2 times 8 db 4 times 8 db 6 sq_1: times 1 dq 1 SECTION .text cextern pb_0 cextern pw_1 cextern pw_4 cextern pw_8 cextern pw_32 cextern pw_64 cextern pw_512 cextern pw_00ff cextern pw_pixel_max cextern sw_64 cextern pd_32 cextern deinterleave_shufd ;============================================================================= ; implicit weighted biprediction ;============================================================================= ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 %if WIN64 DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 PROLOGUE 6,7,%1 %endmacro %elif UNIX64 DECLARE_REG_TMP 0,1,2,3,4,5,7,8 %macro AVG_START 0-1 0 PROLOGUE 6,9,%1 %endmacro %else DECLARE_REG_TMP 1,2,3,4,5,6,1,2 %macro AVG_START 0-1 0 PROLOGUE 0,7,%1 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m mov t4, r4m mov t5, r5m %endmacro %endif %macro AVG_END 0-1 2 ; rows lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t4, [t4+t5*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] sub eax, %1 jg .height_loop RET %endmacro %if HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 movh m1, %2 punpcklwd m0, m1 pmaddwd m0, m3 paddd m0, m4 psrad m0, 6 %endmacro %macro BIWEIGHT_START_MMX 0 movzx t6d, word r6m mov t7d, 64 sub t7d, t6d shl t7d, 16 add t6d, t7d movd m3, t6d SPLATD m3, m3 mova m4, [pd_32] pxor m5, m5 %endmacro %else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_MMX 2 movh m0, %1 movh m1, %2 punpcklbw m0, m5 punpcklbw m1, m5 pmullw m0, m2 pmullw m1, m3 paddw m0, m1 paddw m0, m4 psraw m0, 6 %endmacro %macro BIWEIGHT_START_MMX 0 movd m2, r6m SPLATW m2, m2 ; weight_dst mova m3, [pw_64] psubw m3, m2 ; weight_src mova m4, [pw_32] ; rounding pxor m5, m5 %endmacro %endif ;HIGH_BIT_DEPTH %macro BIWEIGHT_SSSE3 2 movh m0, %1 movh m1, %2 punpcklbw m0, m1 pmaddubsw m0, m3 pmulhrsw m0, m4 %endmacro %macro BIWEIGHT_START_SSSE3 0 movzx t6d, byte r6m ; FIXME x86_64 %if mmsize > 16 vbroadcasti128 m4, [pw_512] %else mova m4, [pw_512] %endif lea t7d, [t6+(64<<8)] shl t6d, 8 sub t7d, t6d %if cpuflag(avx512) vpbroadcastw m3, t7d %else movd xm3, t7d %if cpuflag(avx2) vpbroadcastw m3, xm3 %else SPLATW m3, m3 ; weight_dst,src %endif %endif %endmacro %if HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/4 packssdw m0, m0 CLIPW m0, m5, m7 movh [%1], m0 %else SWAP 0, 6 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] packssdw m6, m0 CLIPW m6, m5, m7 mova [%1], m6 %endif %endmacro %else ;!HIGH_BIT_DEPTH %macro BIWEIGHT_ROW 4 BIWEIGHT [%2], [%3] %if %4==mmsize/2 packuswb m0, m0 movh [%1], m0 %else SWAP 0, 6 BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] packuswb m6, m0 mova [%1], m6 %endif %endmacro %endif ;HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 1-2 0 cglobal pixel_avg_weight_w%1 BIWEIGHT_START AVG_START %2 %if HIGH_BIT_DEPTH 
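; For 9/10-bit content the packssdw in BIWEIGHT only saturates to int16_t,
; which is wider than the pixel range, so keep 0 (m5) and pw_pixel_max (m7)
; around for the final CLIPW. The blend computed below is roughly
;     out = clip( (src1*w + src2*(64-w) + 32) >> 6 )
; with w taken from the i_weight argument (log2_denom = 5, offset = 0).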
mova m7, [pw_pixel_max] %endif .height_loop: %if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL) BIWEIGHT [t2], [t4] SWAP 0, 6 BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5] %if HIGH_BIT_DEPTH packssdw m6, m0 CLIPW m6, m5, m7 %else ;!HIGH_BIT_DEPTH packuswb m6, m0 %endif ;HIGH_BIT_DEPTH movlps [t0], m6 movhps [t0+SIZEOF_PIXEL*t1], m6 %else %assign x 0 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize BIWEIGHT_ROW t0+x, t2+x, t4+x, %1 BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1 %assign x x+mmsize %endrep %endif AVG_END %endmacro %define BIWEIGHT BIWEIGHT_MMX %define BIWEIGHT_START BIWEIGHT_START_MMX INIT_MMX mmx2 AVG_WEIGHT 4 AVG_WEIGHT 8 AVG_WEIGHT 16 %if HIGH_BIT_DEPTH INIT_XMM sse2 AVG_WEIGHT 4, 8 AVG_WEIGHT 8, 8 AVG_WEIGHT 16, 8 %else ;!HIGH_BIT_DEPTH INIT_XMM sse2 AVG_WEIGHT 8, 7 AVG_WEIGHT 16, 7 %define BIWEIGHT BIWEIGHT_SSSE3 %define BIWEIGHT_START BIWEIGHT_START_SSSE3 INIT_MMX ssse3 AVG_WEIGHT 4 INIT_XMM ssse3 AVG_WEIGHT 8, 7 AVG_WEIGHT 16, 7 INIT_YMM avx2 cglobal pixel_avg_weight_w16 BIWEIGHT_START AVG_START 5 .height_loop: movu xm0, [t2] movu xm1, [t4] vinserti128 m0, m0, [t2+t3], 1 vinserti128 m1, m1, [t4+t5], 1 SBUTTERFLY bw, 0, 1, 2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 mova [t0], xm0 vextracti128 [t0+t1], m0, 1 AVG_END INIT_YMM avx512 cglobal pixel_avg_weight_w8 BIWEIGHT_START kxnorb k1, k1, k1 kaddb k1, k1, k1 AVG_START 5 .height_loop: movq xm0, [t2] movq xm2, [t4] movq xm1, [t2+t3] movq xm5, [t4+t5] lea t2, [t2+t3*2] lea t4, [t4+t5*2] vpbroadcastq m0 {k1}, [t2] vpbroadcastq m2 {k1}, [t4] vpbroadcastq m1 {k1}, [t2+t3] vpbroadcastq m5 {k1}, [t4+t5] punpcklbw m0, m2 punpcklbw m1, m5 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 vextracti128 xmm1, m0, 1 movq [t0], xm0 movhps [t0+t1], xm0 lea t0, [t0+t1*2] movq [t0], xmm1 movhps [t0+t1], xmm1 AVG_END 4 INIT_ZMM avx512 cglobal pixel_avg_weight_w16 BIWEIGHT_START AVG_START 5 .height_loop: movu xm0, [t2] movu xm1, [t4] vinserti128 ym0, [t2+t3], 1 vinserti128 ym1, [t4+t5], 1 lea t2, [t2+t3*2] lea t4, [t4+t5*2] vinserti32x4 m0, [t2], 2 vinserti32x4 m1, [t4], 2 vinserti32x4 m0, [t2+t3], 3 vinserti32x4 m1, [t4+t5], 3 SBUTTERFLY bw, 0, 1, 2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 mova [t0], xm0 vextracti128 [t0+t1], ym0, 1 lea t0, [t0+t1*2] vextracti32x4 [t0], m0, 2 vextracti32x4 [t0+t1], m0, 3 AVG_END 4 %endif ;HIGH_BIT_DEPTH ;============================================================================= ; P frame explicit weighted prediction ;============================================================================= %if HIGH_BIT_DEPTH ; width %macro WEIGHT_START 1 mova m0, [r4+ 0] ; 1<= mmsize WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4 %assign x (x+mmsize) %else %assign w %3-x %if w == 20 %assign w 16 %endif WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4 %assign x (x+w) %endif %if x >= %3 %exitrep %endif %endrep %endmacro %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- %macro WEIGHTER 1 cglobal mc_weight_w%1, 6,6,8 FIX_STRIDES r1, r3 WEIGHT_START %1 %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 ; we can merge the shift step into the scale factor ; if (m3<<7) doesn't overflow an int16_t cmp byte [r4+1], 0 jz .fast %endif .loop: WEIGHT_TWO_ROW 
r2, r0, %1, 0 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .loop RET %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0 .fast: psllw m3, 7 .fastloop: WEIGHT_TWO_ROW r2, r0, %1, 1 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .fastloop RET %endif %endmacro INIT_MMX mmx2 WEIGHTER 4 WEIGHTER 8 WEIGHTER 12 WEIGHTER 16 WEIGHTER 20 INIT_XMM sse2 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 %if HIGH_BIT_DEPTH WEIGHTER 12 %else INIT_MMX ssse3 WEIGHTER 4 INIT_XMM ssse3 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 INIT_YMM avx2 WEIGHTER 8 WEIGHTER 16 WEIGHTER 20 %endif %macro OFFSET_OP 7 mov%6 m0, [%1] mov%6 m1, [%2] %if HIGH_BIT_DEPTH p%5usw m0, m2 p%5usw m1, m2 %ifidn %5,add pminsw m0, m3 pminsw m1, m3 %endif %else p%5usb m0, m2 p%5usb m1, m2 %endif mov%7 [%3], m0 mov%7 [%4], m1 %endmacro %macro OFFSET_TWO_ROW 4 %assign x 0 %rep %3 %if (%3*SIZEOF_PIXEL-x) >= mmsize OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a %assign x (x+mmsize) %else %if HIGH_BIT_DEPTH OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h %else OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d %endif %exitrep %endif %if x >= %3*SIZEOF_PIXEL %exitrep %endif %endrep %endmacro ;----------------------------------------------------------------------------- ;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- %macro OFFSET 2 cglobal mc_offset%2_w%1, 6,6 FIX_STRIDES r1, r3 mova m2, [r4] %if HIGH_BIT_DEPTH %ifidn %2,add mova m3, [pw_pixel_max] %endif %endif .loop: OFFSET_TWO_ROW r2, r0, %1, %2 lea r0, [r0+r1*2] lea r2, [r2+r3*2] sub r5d, 2 jg .loop RET %endmacro %macro OFFSETPN 1 OFFSET %1, add OFFSET %1, sub %endmacro INIT_MMX mmx2 OFFSETPN 4 OFFSETPN 8 OFFSETPN 12 OFFSETPN 16 OFFSETPN 20 INIT_XMM sse2 OFFSETPN 12 OFFSETPN 16 OFFSETPN 20 %if HIGH_BIT_DEPTH INIT_XMM sse2 OFFSETPN 8 %endif ;============================================================================= ; pixel avg ;============================================================================= ;----------------------------------------------------------------------------- ; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, ; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 2 cglobal pixel_avg_%1x%2 mov eax, %2 cmp dword r6m, 32 jne pixel_avg_weight_w%1 %+ SUFFIX %if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads jmp pixel_avg_w%1_avx2 %else %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz pixel_avg_w%1_sse2 %endif jmp pixel_avg_w%1_mmx2 %endif %endmacro ;----------------------------------------------------------------------------- ; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, ; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- %macro AVG_FUNC 3 cglobal pixel_avg_w%1 AVG_START .height_loop: %assign x 0 %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize %2 m0, [t2+x] %2 m1, [t2+x+SIZEOF_PIXEL*t3] %if HIGH_BIT_DEPTH pavgw m0, [t4+x] pavgw m1, [t4+x+SIZEOF_PIXEL*t5] %else ;!HIGH_BIT_DEPTH pavgb m0, [t4+x] pavgb m1, [t4+x+SIZEOF_PIXEL*t5] %endif %3 [t0+x], m0 %3 [t0+x+SIZEOF_PIXEL*t1], m1 %assign x x+mmsize %endrep AVG_END %endmacro %if HIGH_BIT_DEPTH INIT_MMX mmx2 AVG_FUNC 4, movq, movq AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 AVG_FUNC 8, movq, movq AVGH 8, 16 
AVGH 8, 8 AVGH 8, 4 AVG_FUNC 16, movq, movq AVGH 16, 16 AVGH 16, 8 INIT_XMM sse2 AVG_FUNC 4, movq, movq AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 AVG_FUNC 8, movdqu, movdqa AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 AVG_FUNC 16, movdqu, movdqa AVGH 16, 16 AVGH 16, 8 %else ;!HIGH_BIT_DEPTH INIT_MMX mmx2 AVG_FUNC 4, movd, movd AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 AVG_FUNC 8, movq, movq AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 AVG_FUNC 16, movq, movq AVGH 16, 16 AVGH 16, 8 INIT_XMM sse2 AVG_FUNC 16, movdqu, movdqa AVGH 16, 16 AVGH 16, 8 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 INIT_XMM ssse3 AVGH 16, 16 AVGH 16, 8 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 INIT_MMX ssse3 AVGH 4, 16 AVGH 4, 8 AVGH 4, 4 AVGH 4, 2 INIT_XMM avx2 AVG_FUNC 16, movdqu, movdqa AVGH 16, 16 AVGH 16, 8 INIT_XMM avx512 AVGH 16, 16 AVGH 16, 8 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 %endif ;HIGH_BIT_DEPTH ;============================================================================= ; pixel avg2 ;============================================================================= %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, ; uint16_t *src1, intptr_t src_stride, ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W_ONE 1 cglobal pixel_avg2_w%1, 6,7,4 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2] movu m1, [r2+r3*2] %if cpuflag(avx) || mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r6] %else movu m2, [r2+r4] movu m3, [r2+r6] pavgw m0, m2 pavgw m1, m3 %endif mova [r0], m0 mova [r0+r1*2], m1 lea r2, [r2+r3*4] lea r0, [r0+r1*4] sub r5d, 2 jg .height_loop RET %endmacro %macro AVG2_W_TWO 3 cglobal pixel_avg2_w%1, 6,7,8 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2] %2 m1, [r2+mmsize] movu m2, [r2+r3*2] %2 m3, [r2+r3*2+mmsize] %if mmsize == 8 pavgw m0, [r2+r4] pavgw m1, [r2+r4+mmsize] pavgw m2, [r2+r6] pavgw m3, [r2+r6+mmsize] %else movu m4, [r2+r4] %2 m5, [r2+r4+mmsize] movu m6, [r2+r6] %2 m7, [r2+r6+mmsize] pavgw m0, m4 pavgw m1, m5 pavgw m2, m6 pavgw m3, m7 %endif mova [r0], m0 %3 [r0+mmsize], m1 mova [r0+r1*2], m2 %3 [r0+r1*2+mmsize], m3 lea r2, [r2+r3*4] lea r0, [r0+r1*4] sub r5d, 2 jg .height_loop RET %endmacro INIT_MMX mmx2 AVG2_W_ONE 4 AVG2_W_TWO 8, movu, mova INIT_XMM sse2 AVG2_W_ONE 8 AVG2_W_TWO 10, movd, movd AVG2_W_TWO 16, movu, mova INIT_YMM avx2 AVG2_W_ONE 16 INIT_MMX cglobal pixel_avg2_w10_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2+ 0] movu m1, [r2+ 8] movh m2, [r2+16] movu m3, [r2+r3*2+ 0] movu m4, [r2+r3*2+ 8] movh m5, [r2+r3*2+16] pavgw m0, [r2+r4+ 0] pavgw m1, [r2+r4+ 8] pavgw m2, [r2+r4+16] pavgw m3, [r2+r6+ 0] pavgw m4, [r2+r6+ 8] pavgw m5, [r2+r6+16] mova [r0+ 0], m0 mova [r0+ 8], m1 movh [r0+16], m2 mova [r0+r1*2+ 0], m3 mova [r0+r1*2+ 8], m4 movh [r0+r1*2+16], m5 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] sub r5d, 2 jg .height_loop RET cglobal pixel_avg2_w16_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3*2] .height_loop: movu m0, [r2+ 0] movu m1, [r2+ 8] movu m2, [r2+16] movu m3, [r2+24] movu m4, [r2+r3*2+ 0] movu m5, [r2+r3*2+ 8] movu m6, [r2+r3*2+16] movu m7, [r2+r3*2+24] pavgw m0, [r2+r4+ 0] pavgw m1, [r2+r4+ 8] pavgw m2, [r2+r4+16] pavgw m3, [r2+r4+24] pavgw m4, [r2+r6+ 0] pavgw m5, [r2+r6+ 8] pavgw m6, [r2+r6+16] pavgw m7, [r2+r6+24] mova [r0+ 0], m0 mova [r0+ 8], m1 mova [r0+16], m2 mova [r0+24], m3 mova [r0+r1*2+ 0], m4 mova [r0+r1*2+ 8], m5 mova [r0+r1*2+16], m6 mova [r0+r1*2+24], m7 lea r2, [r2+r3*2*2] lea r0, [r0+r1*2*2] sub r5d, 2 jg .height_loop 
RET cglobal pixel_avg2_w18_mmx2, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] movu m1, [r2+ 8] movu m2, [r2+16] movu m3, [r2+24] movh m4, [r2+32] pavgw m0, [r2+r4+ 0] pavgw m1, [r2+r4+ 8] pavgw m2, [r2+r4+16] pavgw m3, [r2+r4+24] pavgw m4, [r2+r4+32] mova [r0+ 0], m0 mova [r0+ 8], m1 mova [r0+16], m2 mova [r0+24], m3 movh [r0+32], m4 lea r2, [r2+r3*2] lea r0, [r0+r1*2] dec r5d jg .height_loop RET %macro PIXEL_AVG_W18 0 cglobal pixel_avg2_w18, 6,7 sub r4, r2 .height_loop: movu m0, [r2+ 0] movd xm2, [r2+32] %if mmsize == 32 pavgw m0, [r2+r4+ 0] movd xm1, [r2+r4+32] pavgw xm2, xm1 %else movu m1, [r2+16] movu m3, [r2+r4+ 0] movu m4, [r2+r4+16] movd m5, [r2+r4+32] pavgw m0, m3 pavgw m1, m4 pavgw m2, m5 mova [r0+16], m1 %endif mova [r0+ 0], m0 movd [r0+32], xm2 lea r2, [r2+r3*2] lea r0, [r0+r1*2] dec r5d jg .height_loop RET %endmacro INIT_XMM sse2 PIXEL_AVG_W18 INIT_YMM avx2 PIXEL_AVG_W18 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, ; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 cglobal pixel_avg2_w%1_mmx2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: %2 mm0, [r2] %2 mm1, [r2+r3] pavgb mm0, [r2+r4] pavgb mm1, [r2+r6] lea r2, [r2+r3*2] %2 [r0], mm0 %2 [r0+r1], mm1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET %endmacro INIT_MMX AVG2_W8 4, movd AVG2_W8 8, movq %macro AVG2_W16 2 cglobal pixel_avg2_w%1_mmx2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movq mm0, [r4] %2 mm1, [r4+8] movq mm2, [r4+r3] %2 mm3, [r4+r3+8] pavgb mm0, [r4+r2] pavgb mm1, [r4+r2+8] pavgb mm2, [r4+r6] pavgb mm3, [r4+r6+8] lea r4, [r4+r3*2] movq [r0], mm0 %2 [r0+8], mm1 movq [r0+r1], mm2 %2 [r0+r1+8], mm3 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET %endmacro AVG2_W16 12, movd AVG2_W16 16, movq cglobal pixel_avg2_w20_mmx2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movq mm0, [r4] movq mm1, [r4+8] movd mm2, [r4+16] movq mm3, [r4+r3] movq mm4, [r4+r3+8] movd mm5, [r4+r3+16] pavgb mm0, [r4+r2] pavgb mm1, [r4+r2+8] pavgb mm2, [r4+r2+16] pavgb mm3, [r4+r6] pavgb mm4, [r4+r6+8] pavgb mm5, [r4+r6+16] lea r4, [r4+r3*2] movq [r0], mm0 movq [r0+8], mm1 movd [r0+16], mm2 movq [r0+r1], mm3 movq [r0+r1+8], mm4 movd [r0+r1+16], mm5 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET INIT_XMM cglobal pixel_avg2_w16_sse2, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: movu m0, [r2] movu m2, [r2+r3] movu m1, [r2+r4] movu m3, [r2+r6] lea r2, [r2+r3*2] pavgb m0, m1 pavgb m2, m3 mova [r0], m0 mova [r0+r1], m2 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET cglobal pixel_avg2_w20_sse2, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movu m0, [r4] movu m2, [r4+r3] movu m1, [r4+r2] movu m3, [r4+r6] movd mm4, [r4+16] movd mm5, [r4+r3+16] pavgb m0, m1 pavgb m2, m3 pavgb mm4, [r4+r2+16] pavgb mm5, [r4+r6+16] lea r4, [r4+r3*2] mova [r0], m0 mova [r0+r1], m2 movd [r0+16], mm4 movd [r0+r1+16], mm5 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET INIT_YMM avx2 cglobal pixel_avg2_w20, 6,7 sub r2, r4 lea r6, [r2+r3] .height_loop: movu m0, [r4] movu m1, [r4+r3] pavgb m0, [r4+r2] pavgb m1, [r4+r6] lea r4, [r4+r3*2] mova [r0], m0 mova [r0+r1], m1 lea r0, [r0+r1*2] sub r5d, 2 jg .height_loop RET ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. 
; This particular instance is complicated by the fact that src1 and src2 ; can have different alignments. For simplicity and code size, only the ; MMX cacheline workaround is used. As a result, in the case of SSE2 ; pixel_avg, the cacheline check functions calls the SSE2 version if there ; is no cacheline split, and the MMX workaround if there is. %macro INIT_SHIFT 2 and eax, 7 shl eax, 3 movd %1, [sw_64] movd %2, eax psubw %1, %2 %endmacro %macro AVG_CACHELINE_START 0 %assign stack_offset 0 INIT_SHIFT mm6, mm7 mov eax, r4m INIT_SHIFT mm4, mm5 PROLOGUE 6,6 and r2, ~7 and r4, ~7 sub r4, r2 .height_loop: %endmacro %macro AVG_CACHELINE_LOOP 2 movq mm1, [r2+%1] movq mm0, [r2+8+%1] movq mm3, [r2+r4+%1] movq mm2, [r2+r4+8+%1] psrlq mm1, mm7 psllq mm0, mm6 psrlq mm3, mm5 psllq mm2, mm4 por mm0, mm1 por mm2, mm3 pavgb mm2, mm0 %2 [r0+%1], mm2 %endmacro %macro AVG_CACHELINE_FUNC 2 pixel_avg2_w%1_cache_mmx2: AVG_CACHELINE_START AVG_CACHELINE_LOOP 0, movq %if %1>8 AVG_CACHELINE_LOOP 8, movq %if %1>16 AVG_CACHELINE_LOOP 16, movd %endif %endif add r2, r3 add r0, r1 dec r5d jg .height_loop RET %endmacro %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set %if %1 == 12 ;w12 isn't needed because w16 is just as fast if there's no cacheline split %define cachesplit pixel_avg2_w16_cache_mmx2 %else %define cachesplit pixel_avg2_w%1_cache_mmx2 %endif cglobal pixel_avg2_w%1_cache%2_%3 mov eax, r2m and eax, %2-1 cmp eax, (%2-%1-(%1 % 8)) %if %1==12||%1==20 jbe pixel_avg2_w%1_%3 %else jb pixel_avg2_w%1_%3 %endif %if 0 ; or %1==8 - but the extra branch seems too expensive ja cachesplit %if ARCH_X86_64 test r4b, 1 %else test byte r4m, 1 %endif jz pixel_avg2_w%1_%3 %else or eax, r4m and eax, 7 jz pixel_avg2_w%1_%3 mov eax, r2m %endif %if mmsize==16 || (%1==8 && %2==64) AVG_CACHELINE_FUNC %1, %2 %else jmp cachesplit %endif %endmacro INIT_MMX AVG_CACHELINE_CHECK 8, 64, mmx2 AVG_CACHELINE_CHECK 12, 64, mmx2 %if ARCH_X86_64 == 0 AVG_CACHELINE_CHECK 16, 64, mmx2 AVG_CACHELINE_CHECK 20, 64, mmx2 AVG_CACHELINE_CHECK 8, 32, mmx2 AVG_CACHELINE_CHECK 12, 32, mmx2 AVG_CACHELINE_CHECK 16, 32, mmx2 AVG_CACHELINE_CHECK 20, 32, mmx2 %endif INIT_XMM AVG_CACHELINE_CHECK 16, 64, sse2 AVG_CACHELINE_CHECK 20, 64, sse2 ; computed jump assumes this loop is exactly 48 bytes %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment ALIGN 16 avg_w16_align%1_%2_ssse3: %if %1==0 && %2==0 movdqa xmm1, [r2] pavgb xmm1, [r2+r4] add r2, r3 %elif %1==0 movdqa xmm1, [r2+r4+16] palignr xmm1, [r2+r4], %2 pavgb xmm1, [r2] add r2, r3 %elif %2&15==0 movdqa xmm1, [r2+16] palignr xmm1, [r2], %1 pavgb xmm1, [r2+r4] add r2, r3 %else movdqa xmm1, [r2+16] movdqa xmm2, [r2+r4+16] palignr xmm1, [r2], %1 palignr xmm2, [r2+r4], %2&15 add r2, r3 pavgb xmm1, xmm2 %endif movdqa [r0], xmm1 add r0, r1 dec r5d jg avg_w16_align%1_%2_ssse3 ret %if %1==0 ; make sure the first ones don't end up short ALIGN 16 times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop %endif %endmacro cglobal pixel_avg2_w16_cache64_ssse3 %if 0 ; seems both tests aren't worth it if src1%16==0 is optimized mov eax, r2m and eax, 0x3f cmp eax, 0x30 jb pixel_avg2_w16_sse2 or eax, r4m and eax, 7 jz pixel_avg2_w16_sse2 %endif PROLOGUE 6, 8 lea r6, [r4+r2] and r4, ~0xf and r6, 0x1f and r2, ~0xf lea r6, [r6*3] ;(offset + align*2)*3 sub r4, r2 shl r6, 4 ;jump = (offset + align*2)*48 %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3) %if ARCH_X86_64 lea r7, [avg_w16_addr] add r6, r7 %else lea r6, [avg_w16_addr + r6] %endif TAIL_CALL r6, 1 %assign j 0 %assign k 1 %rep 16 
AVG16_CACHELINE_LOOP_SSSE3 j, j AVG16_CACHELINE_LOOP_SSSE3 j, k %assign j j+1 %assign k k+1 %endrep %endif ; !HIGH_BIT_DEPTH ;============================================================================= ; pixel copy ;============================================================================= %macro COPY1 2 movu m0, [r2] movu m1, [r2+r3] movu m2, [r2+r3*2] movu m3, [r2+%2] mova [r0], m0 mova [r0+r1], m1 mova [r0+r1*2], m2 mova [r0+%1], m3 %endmacro %macro COPY2 2-4 0, 1 movu m0, [r2+%3*mmsize] movu m1, [r2+%4*mmsize] movu m2, [r2+r3+%3*mmsize] movu m3, [r2+r3+%4*mmsize] mova [r0+%3*mmsize], m0 mova [r0+%4*mmsize], m1 mova [r0+r1+%3*mmsize], m2 mova [r0+r1+%4*mmsize], m3 movu m0, [r2+r3*2+%3*mmsize] movu m1, [r2+r3*2+%4*mmsize] movu m2, [r2+%2+%3*mmsize] movu m3, [r2+%2+%4*mmsize] mova [r0+r1*2+%3*mmsize], m0 mova [r0+r1*2+%4*mmsize], m1 mova [r0+%1+%3*mmsize], m2 mova [r0+%1+%4*mmsize], m3 %endmacro %macro COPY4 2 COPY2 %1, %2, 0, 1 COPY2 %1, %2, 2, 3 %endmacro ;----------------------------------------------------------------------------- ; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, ; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- INIT_MMX cglobal mc_copy_w4_mmx, 4,6 FIX_STRIDES r1, r3 cmp dword r4m, 4 lea r5, [r3*3] lea r4, [r1*3] je .end %if HIGH_BIT_DEPTH == 0 %define mova movd %define movu movd %endif COPY1 r4, r5 lea r2, [r2+r3*4] lea r0, [r0+r1*4] .end: COPY1 r4, r5 RET %macro MC_COPY 1 %assign %%w %1*SIZEOF_PIXEL/mmsize %if %%w > 0 cglobal mc_copy_w%1, 5,7 FIX_STRIDES r1, r3 lea r6, [r3*3] lea r5, [r1*3] .height_loop: COPY %+ %%w r5, r6 lea r2, [r2+r3*4] lea r0, [r0+r1*4] sub r4d, 4 jg .height_loop RET %endif %endmacro INIT_MMX mmx MC_COPY 8 MC_COPY 16 INIT_XMM sse MC_COPY 8 MC_COPY 16 INIT_XMM aligned, sse MC_COPY 16 %if HIGH_BIT_DEPTH INIT_YMM avx MC_COPY 16 INIT_YMM aligned, avx MC_COPY 16 %endif ;============================================================================= ; prefetch ;============================================================================= ; assumes 64 byte cachelines ; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- ; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, ; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- %macro PREFETCH_FENC 1 %if ARCH_X86_64 cglobal prefetch_fenc_%1, 5,5 FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d lea r0, [r0+r4*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] imul eax, r3d lea r2, [r2+rax*2+64*SIZEOF_PIXEL] prefetcht0 [r2] prefetcht0 [r2+r3] %ifidn %1, 422 lea r2, [r2+r3*2] prefetcht0 [r2] prefetcht0 [r2+r3] %endif RET %else cglobal prefetch_fenc_%1, 0,3 mov r2, r4m mov r1, r1m mov r0, r0m FIX_STRIDES r1 and r2, 3 imul r2, r1 lea r0, [r0+r2*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] mov r2, r4m mov r1, r3m mov r0, r2m FIX_STRIDES r1 and r2, 3 imul r2, r1 lea r0, [r0+r2*2+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] %ifidn %1, 422 lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] %endif ret %endif ; ARCH_X86_64 %endmacro INIT_MMX mmx2 PREFETCH_FENC 420 PREFETCH_FENC 422 %if ARCH_X86_64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 2 %endif cglobal prefetch_fenc_400, 2,3 movifnidn t0d, r4m FIX_STRIDES r1 and t0d, 3 imul t0d, r1d 
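; Luma-only variant of prefetch_fenc_420/422: advance (mb_x&3)*4 rows plus
; 64 pixels, then touch 4 rows, so consecutive calls along the macroblock row
; cover different quarters of the upcoming fenc rows (same staggering idea
; as the 420/422 versions above).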
lea r0, [r0+t0*4+64*SIZEOF_PIXEL] prefetcht0 [r0] prefetcht0 [r0+r1] lea r0, [r0+r1*2] prefetcht0 [r0] prefetcht0 [r0+r1] RET ;----------------------------------------------------------------------------- ; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal prefetch_ref, 3,3 FIX_STRIDES r1 dec r2d and r2d, r1d lea r0, [r0+r2*8+64*SIZEOF_PIXEL] lea r2, [r1*3] prefetcht0 [r0] prefetcht0 [r0+r1] prefetcht0 [r0+r1*2] prefetcht0 [r0+r2] lea r0, [r0+r1*4] prefetcht0 [r0] prefetcht0 [r0+r1] prefetcht0 [r0+r1*2] prefetcht0 [r0+r2] RET ;============================================================================= ; chroma MC ;============================================================================= %if ARCH_X86_64 DECLARE_REG_TMP 6,7,8 %else DECLARE_REG_TMP 0,1,2 %endif %macro MC_CHROMA_START 1 %if ARCH_X86_64 PROLOGUE 0,9,%1 %else PROLOGUE 0,6,%1 %endif movifnidn r3, r3mp movifnidn r4d, r4m movifnidn r5d, r5m movifnidn t0d, r6m mov t2d, t0d mov t1d, r5d sar t0d, 3 sar t1d, 3 imul t0d, r4d lea t0d, [t0+t1*2] FIX_STRIDES t0d movsxdifnidn t0, t0d add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride %endmacro %if HIGH_BIT_DEPTH %macro UNPACK_UNALIGNED 4 movu %1, [%4+0] movu %2, [%4+4] punpckhwd %3, %1, %2 punpcklwd %1, %2 %if mmsize == 8 mova %2, %1 punpcklwd %1, %3 punpckhwd %2, %3 %else shufps %2, %1, %3, q3131 shufps %1, %3, q2020 %endif %endmacro %else ; !HIGH_BIT_DEPTH %macro UNPACK_UNALIGNED 3 %if mmsize == 8 punpcklwd %1, %3 %else movh %2, %3 punpcklwd %1, %2 %endif %endmacro %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, ; uint8_t *src, intptr_t src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- %macro MC_CHROMA 0 cglobal mc_chroma MC_CHROMA_START 0 FIX_STRIDES r4 and r5d, 7 %if ARCH_X86_64 jz .mc1dy %endif and t2d, 7 %if ARCH_X86_64 jz .mc1dx %endif shl r5d, 16 add t2d, r5d mov t0d, t2d shl t2d, 8 sub t2d, t0d add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y) cmp dword r7m, 4 %if mmsize==8 .skip_prologue: %else jl mc_chroma_mmx2 %+ .skip_prologue WIN64_SPILL_XMM 9 %endif movd m5, t2d movifnidn r0, r0mp movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m pxor m6, m6 punpcklbw m5, m6 %if mmsize==8 pshufw m7, m5, q3232 pshufw m6, m5, q0000 pshufw m5, m5, q1111 jge .width4 %else %if WIN64 cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM %endif pshufd m7, m5, q1111 punpcklwd m5, m5 pshufd m6, m5, q0000 pshufd m5, m5, q1111 jg .width8 %endif %if HIGH_BIT_DEPTH add r2, r2 UNPACK_UNALIGNED m0, m1, m2, r3 %else movu m0, [r3] UNPACK_UNALIGNED m0, m1, [r3+2] mova m1, m0 pand m0, [pw_00ff] psrlw m1, 8 %endif ; HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 packssdw m0, m1 SWAP 3, 0 ALIGN 4 .loop2: %if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m1, m2, r3+r4 pmullw m3, m6 %else ; !HIGH_BIT_DEPTH movu m0, [r3+r4] UNPACK_UNALIGNED m0, m1, [r3+r4+2] pmullw m3, m6 mova m1, m0 pand m0, [pw_00ff] psrlw m1, 8 %endif ; HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 mova m2, [pw_32] packssdw m0, m1 paddw m2, m3 mova m3, m0 pmullw m0, m5 paddw m0, m2 psrlw m0, 6 %if HIGH_BIT_DEPTH movh [r0], m0 %if mmsize == 8 psrlq m0, 32 movh [r1], m0 %else movhps [r1], m0 %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m0 movd [r0], m0 %if mmsize==8 psrlq m0, 16 %else psrldq m0, 4 %endif movd [r1], m0 
%endif ; HIGH_BIT_DEPTH add r3, r4 add r0, r2 add r1, r2 dec r5d jg .loop2 RET %if mmsize==8 .width4: %if ARCH_X86_64 mov t0, r0 mov t1, r1 mov t2, r3 %if WIN64 %define multy0 r4m %else %define multy0 [rsp-8] %endif mova multy0, m5 %else mov r3m, r3 %define multy0 r4m mova multy0, m5 %endif %else .width8: %if ARCH_X86_64 %define multy0 m8 SWAP 8, 5 %else %define multy0 r0m mova multy0, m5 %endif %endif FIX_STRIDES r2 .loopx: %if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m2, m4, r3 UNPACK_UNALIGNED m1, m3, m5, r3+mmsize %else movu m0, [r3] movu m1, [r3+mmsize/2] UNPACK_UNALIGNED m0, m2, [r3+2] UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] psrlw m2, m0, 8 psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] %endif pmaddwd m0, m7 pmaddwd m2, m7 pmaddwd m1, m7 pmaddwd m3, m7 packssdw m0, m2 packssdw m1, m3 SWAP 4, 0 SWAP 5, 1 add r3, r4 ALIGN 4 .loop4: %if HIGH_BIT_DEPTH UNPACK_UNALIGNED m0, m1, m2, r3 pmaddwd m0, m7 pmaddwd m1, m7 packssdw m0, m1 UNPACK_UNALIGNED m1, m2, m3, r3+mmsize pmaddwd m1, m7 pmaddwd m2, m7 packssdw m1, m2 %else ; !HIGH_BIT_DEPTH movu m0, [r3] movu m1, [r3+mmsize/2] UNPACK_UNALIGNED m0, m2, [r3+2] UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2] psrlw m2, m0, 8 psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] pmaddwd m0, m7 pmaddwd m2, m7 pmaddwd m1, m7 pmaddwd m3, m7 packssdw m0, m2 packssdw m1, m3 %endif ; HIGH_BIT_DEPTH pmullw m4, m6 pmullw m5, m6 mova m2, [pw_32] paddw m3, m2, m5 paddw m2, m4 mova m4, m0 mova m5, m1 pmullw m0, multy0 pmullw m1, multy0 paddw m0, m2 paddw m1, m3 psrlw m0, 6 psrlw m1, 6 %if HIGH_BIT_DEPTH movh [r0], m0 movh [r0+mmsize/2], m1 %if mmsize==8 psrlq m0, 32 psrlq m1, 32 movh [r1], m0 movh [r1+mmsize/2], m1 %else movhps [r1], m0 movhps [r1+mmsize/2], m1 %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m1 %if mmsize==8 pshufw m1, m0, q0020 pshufw m0, m0, q0031 movd [r0], m1 movd [r1], m0 %else pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 %endif %endif ; HIGH_BIT_DEPTH add r3, r4 add r0, r2 add r1, r2 dec r5d jg .loop4 %if mmsize!=8 RET %else sub dword r7m, 4 jg .width8 RET .width8: %if ARCH_X86_64 lea r3, [t2+8*SIZEOF_PIXEL] lea r0, [t0+4*SIZEOF_PIXEL] lea r1, [t1+4*SIZEOF_PIXEL] %else mov r3, r3m mov r0, r0m mov r1, r1m add r3, 8*SIZEOF_PIXEL add r0, 4*SIZEOF_PIXEL add r1, 4*SIZEOF_PIXEL %endif mov r5d, r8m jmp .loopx %endif %if ARCH_X86_64 ; too many regs for x86_32 RESET_MM_PERMUTATION %if WIN64 %assign stack_offset stack_offset - stack_size_padded %assign stack_size_padded 0 %assign xmm_regs_used 0 %endif .mc1dy: and t2d, 7 movd m5, t2d mov r6d, r4d ; pel_offset = dx ? 
2 : src_stride jmp .mc1d .mc1dx: movd m5, r5d mov r6d, 2*SIZEOF_PIXEL .mc1d: %if HIGH_BIT_DEPTH && mmsize == 16 WIN64_SPILL_XMM 8 %endif mova m4, [pw_8] SPLATW m5, m5 psubw m4, m5 movifnidn r0, r0mp movifnidn r1, r1mp movifnidn r2d, r2m FIX_STRIDES r2 movifnidn r5d, r8m cmp dword r7m, 4 jg .mc1d_w8 mov r7, r2 mov r8, r4 %if mmsize!=8 shr r5d, 1 %endif .loop1d_w4: %if HIGH_BIT_DEPTH %if mmsize == 8 movq m0, [r3+0] movq m2, [r3+8] movq m1, [r3+r6+0] movq m3, [r3+r6+8] %else movu m0, [r3] movu m1, [r3+r6] add r3, r8 movu m2, [r3] movu m3, [r3+r6] %endif SBUTTERFLY wd, 0, 2, 6 SBUTTERFLY wd, 1, 3, 7 SBUTTERFLY wd, 0, 2, 6 SBUTTERFLY wd, 1, 3, 7 %if mmsize == 16 SBUTTERFLY wd, 0, 2, 6 SBUTTERFLY wd, 1, 3, 7 %endif %else ; !HIGH_BIT_DEPTH movq m0, [r3] movq m1, [r3+r6] %if mmsize!=8 add r3, r8 movhps m0, [r3] movhps m1, [r3+r6] %endif psrlw m2, m0, 8 psrlw m3, m1, 8 pand m0, [pw_00ff] pand m1, [pw_00ff] %endif ; HIGH_BIT_DEPTH pmullw m0, m4 pmullw m1, m5 pmullw m2, m4 pmullw m3, m5 paddw m0, [pw_4] paddw m2, [pw_4] paddw m0, m1 paddw m2, m3 psrlw m0, 3 psrlw m2, 3 %if HIGH_BIT_DEPTH %if mmsize == 8 xchg r4, r8 xchg r2, r7 %endif movq [r0], m0 movq [r1], m2 %if mmsize == 16 add r0, r7 add r1, r7 movhps [r0], m0 movhps [r1], m2 %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m2 %if mmsize==8 xchg r4, r8 xchg r2, r7 movd [r0], m0 psrlq m0, 32 movd [r1], m0 %else movhlps m1, m0 movd [r0], m0 movd [r1], m1 add r0, r7 add r1, r7 psrldq m0, 4 psrldq m1, 4 movd [r0], m0 movd [r1], m1 %endif %endif ; HIGH_BIT_DEPTH add r3, r4 add r0, r2 add r1, r2 dec r5d jg .loop1d_w4 RET .mc1d_w8: sub r2, 4*SIZEOF_PIXEL sub r4, 8*SIZEOF_PIXEL mov r7, 4*SIZEOF_PIXEL mov r8, 8*SIZEOF_PIXEL %if mmsize==8 shl r5d, 1 %endif jmp .loop1d_w4 %endif ; ARCH_X86_64 %endmacro ; MC_CHROMA %macro MC_CHROMA_SSSE3 0 cglobal mc_chroma MC_CHROMA_START 10-cpuflag(avx2) and r5d, 7 and t2d, 7 mov t0d, r5d shl t0d, 8 sub t0d, r5d mov r5d, 8 add t0d, 8 sub r5d, t2d imul t2d, t0d ; (x*255+8)*y imul r5d, t0d ; (x*255+8)*(8-y) movd xm6, t2d movd xm7, r5d %if cpuflag(cache64) mov t0d, r3d and t0d, 7 %if ARCH_X86_64 lea t1, [ch_shuf_adj] movddup xm5, [t1 + t0*4] %else movddup xm5, [ch_shuf_adj + t0*4] %endif paddb xm5, [ch_shuf] and r3, ~7 %else mova m5, [ch_shuf] %endif movifnidn r0, r0mp movifnidn r1, r1mp movifnidn r2d, r2m movifnidn r5d, r8m %if cpuflag(avx2) vpbroadcastw m6, xm6 vpbroadcastw m7, xm7 %else SPLATW m6, m6 SPLATW m7, m7 %endif %if ARCH_X86_64 %define shiftround m8 mova m8, [pw_512] %else %define shiftround [pw_512] %endif cmp dword r7m, 4 jg .width8 %if cpuflag(avx2) .loop4: movu xm0, [r3] movu xm1, [r3+r4] vinserti128 m0, m0, [r3+r4], 1 vinserti128 m1, m1, [r3+r4*2], 1 pshufb m0, m5 pshufb m1, m5 pmaddubsw m0, m7 pmaddubsw m1, m6 paddw m0, m1 pmulhrsw m0, shiftround packuswb m0, m0 vextracti128 xm1, m0, 1 movd [r0], xm0 movd [r0+r2], xm1 psrldq xm0, 4 psrldq xm1, 4 movd [r1], xm0 movd [r1+r2], xm1 lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop4 RET .width8: movu xm0, [r3] vinserti128 m0, m0, [r3+8], 1 pshufb m0, m5 .loop8: movu xm3, [r3+r4] vinserti128 m3, m3, [r3+r4+8], 1 pshufb m3, m5 pmaddubsw m1, m0, m7 pmaddubsw m2, m3, m6 pmaddubsw m3, m3, m7 movu xm0, [r3+r4*2] vinserti128 m0, m0, [r3+r4*2+8], 1 pshufb m0, m5 pmaddubsw m4, m0, m6 paddw m1, m2 paddw m3, m4 pmulhrsw m1, shiftround pmulhrsw m3, shiftround packuswb m1, m3 mova m2, [deinterleave_shufd] vpermd m1, m2, m1 vextracti128 xm2, m1, 1 movq [r0], xm1 movhps [r1], xm1 movq [r0+r2], xm2 movhps [r1+r2], xm2 %else movu m0, [r3] pshufb m0, m5 .loop4: 
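; Each output sample is the H.264 bilinear chroma interpolation, roughly
;     ( (8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32 ) >> 6
; m7/m6 hold the packed (8-x,x) byte weights premultiplied by (8-y) and y
; (the (x*255+8)*... setup above), so one pmaddubsw per input row plus a
; paddw builds the numerator and pmulhrsw by pw_512 performs the +32 >> 6
; rounding.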
movu m1, [r3+r4] pshufb m1, m5 movu m3, [r3+r4*2] pshufb m3, m5 mova m4, m3 pmaddubsw m0, m7 pmaddubsw m2, m1, m7 pmaddubsw m1, m6 pmaddubsw m3, m6 paddw m1, m0 paddw m3, m2 pmulhrsw m1, shiftround pmulhrsw m3, shiftround mova m0, m4 packuswb m1, m3 movd [r0], m1 %if cpuflag(sse4) pextrd [r1], m1, 1 pextrd [r0+r2], m1, 2 pextrd [r1+r2], m1, 3 %else movhlps m3, m1 movd [r0+r2], m3 psrldq m1, 4 psrldq m3, 4 movd [r1], m1 movd [r1+r2], m3 %endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop4 RET .width8: movu m0, [r3] pshufb m0, m5 movu m1, [r3+8] pshufb m1, m5 %if ARCH_X86_64 SWAP 9, 6 %define mult1 m9 %else mova r0m, m6 %define mult1 r0m %endif .loop8: movu m2, [r3+r4] pshufb m2, m5 movu m3, [r3+r4+8] pshufb m3, m5 mova m4, m2 mova m6, m3 pmaddubsw m0, m7 pmaddubsw m1, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 paddw m0, m2 paddw m1, m3 pmulhrsw m0, shiftround ; x + 32 >> 6 pmulhrsw m1, shiftround packuswb m0, m1 pshufd m0, m0, q3120 movq [r0], m0 movhps [r1], m0 movu m2, [r3+r4*2] pshufb m2, m5 movu m3, [r3+r4*2+8] pshufb m3, m5 mova m0, m2 mova m1, m3 pmaddubsw m4, m7 pmaddubsw m6, m7 pmaddubsw m2, mult1 pmaddubsw m3, mult1 paddw m2, m4 paddw m3, m6 pmulhrsw m2, shiftround pmulhrsw m3, shiftround packuswb m2, m3 pshufd m2, m2, q3120 movq [r0+r2], m2 movhps [r1+r2], m2 %endif lea r3, [r3+r4*2] lea r0, [r0+r2*2] lea r1, [r1+r2*2] sub r5d, 2 jg .loop8 RET %endmacro %if HIGH_BIT_DEPTH INIT_MMX mmx2 MC_CHROMA INIT_XMM sse2 MC_CHROMA INIT_XMM avx MC_CHROMA %else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 MC_CHROMA INIT_XMM sse2 MC_CHROMA INIT_XMM ssse3 MC_CHROMA_SSSE3 INIT_XMM cache64, ssse3 MC_CHROMA_SSSE3 INIT_XMM avx MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 INIT_YMM avx2 MC_CHROMA_SSSE3 %endif ; HIGH_BIT_DEPTH x264-master/common/x86/mc-a2.asm000066400000000000000000002246161502133446700164100ustar00rootroot00000000000000;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** ;* Copyright (C) 2005-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Holger Lubitz ;* Mathieu Monnier ;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 64 %if HIGH_BIT_DEPTH v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20, db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62 v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00 v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15 v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14 ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800 copy_swap_shuf: SHUFFLE_MASK_W 1,0,3,2,5,4,7,6 deinterleave_shuf: SHUFFLE_MASK_W 0,2,4,6,1,3,5,7 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else deinterleave_rgb_shuf: db 0, 3, 6, 9, 0, 3, 6, 9, 1, 4, 7,10, 2, 5, 8,11 db 0, 4, 8,12, 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14 copy_swap_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14 deinterleave_shuf: db 0, 2, 4, 6, 8,10,12,14, 1, 3, 5, 7, 9,11,13,15 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif ; !HIGH_BIT_DEPTH pw_1024: times 16 dw 1024 filt_mul20: times 32 db 20 filt_mul15: times 16 db 1, -5 filt_mul51: times 16 db -5, 1 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6 db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14 ; bits 0-3: pshufb, bits 4-7: AVX-512 vpermq mbtree_fix8_pack_shuf: db 0x01,0x20,0x43,0x62,0x15,0x34,0x57,0x76,0x09,0x08,0x0b,0x0a,0x0d,0x0c,0x0f,0x0e pf_256: times 4 dd 256.0 pf_inv16777216: times 4 dd 0x1p-24 pd_16: times 4 dd 16 pad10: times 8 dw 10*PIXEL_MAX pad20: times 8 dw 20*PIXEL_MAX pad30: times 8 dw 30*PIXEL_MAX depad: times 4 dd 32*20*PIXEL_MAX + 512 tap1: times 4 dw 1, -5 tap2: times 4 dw 20, 20 tap3: times 4 dw -5, 1 pw_0xc000: times 8 dw 0xc000 pw_31: times 8 dw 31 pd_4: times 4 dd 4 SECTION .text cextern pb_0 cextern pw_1 cextern pw_8 cextern pw_16 cextern pw_32 cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max cextern pw_0to15 cextern pd_8 cextern pd_0123 cextern pd_ffff cextern deinterleave_shufd %macro LOAD_ADD 4 movh %4, %3 movh %1, %2 punpcklbw %4, m0 punpcklbw %1, m0 paddw %1, %4 %endmacro %macro LOAD_ADD_2 6 mova %5, %3 mova %1, %4 punpckhbw %6, %5, m0 punpcklbw %5, m0 punpckhbw %2, %1, m0 punpcklbw %1, m0 paddw %1, %5 paddw %2, %6 %endmacro %macro FILT_V2 6 psubw %1, %2 ; a-b psubw %4, %5 psubw %2, %3 ; b-c psubw %5, %6 psllw %2, 2 psllw %5, 2 psubw %1, %2 ; a-5*b+4*c psllw %3, 4 psubw %4, %5 psllw %6, 4 paddw %1, %3 ; a-5*b+20*c paddw %4, %6 %endmacro %macro FILT_H 3 psubw %1, %2 ; a-b psraw %1, 2 ; (a-b)/4 psubw %1, %2 ; (a-b)/4-b paddw %1, %3 ; (a-b)/4-b+c psraw %1, 2 ; ((a-b)/4-b+c)/4 paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 %endmacro %macro FILT_H2 6 psubw %1, %2 psubw %4, %5 psraw %1, 2 psraw %4, 2 psubw %1, %2 psubw %4, %5 paddw %1, %3 paddw %4, %6 psraw %1, 2 psraw %4, 2 paddw %1, %3 paddw %4, %6 %endmacro %macro FILT_PACK 3-5 %if cpuflag(ssse3) pmulhrsw %1, %3 pmulhrsw %2, %3 %else paddw %1, %3 paddw %2, %3 %if %0 == 5 psubusw %1, %5 psubusw %2, %5 psrlw %1, %4 psrlw %2, %4 %else psraw %1, 
%4 psraw %2, %4 %endif %endif %if HIGH_BIT_DEPTH == 0 packuswb %1, %2 %endif %endmacro ;The hpel_filter routines use non-temporal writes for output. ;The following defines may be uncommented for testing. ;Doing the hpel_filter temporal may be a win if the last level cache ;is big enough (preliminary benching suggests on the order of 4* framesize). ;%define movntq movq ;%define movntps movaps ;%define sfence %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_FILTER 0 cglobal hpel_filter_v, 5,6,11 FIX_STRIDES r3, r4 lea r5, [r1+r3] sub r1, r3 sub r1, r3 %if num_mmregs > 8 mova m8, [pad10] mova m9, [pad20] mova m10, [pad30] %define s10 m8 %define s20 m9 %define s30 m10 %else %define s10 [pad10] %define s20 [pad20] %define s30 [pad30] %endif add r0, r4 add r2, r4 neg r4 mova m7, [pw_pixel_max] pxor m0, m0 .loop: mova m1, [r1] mova m2, [r1+r3] mova m3, [r1+r3*2] mova m4, [r1+mmsize] mova m5, [r1+r3+mmsize] mova m6, [r1+r3*2+mmsize] paddw m1, [r5+r3*2] paddw m2, [r5+r3] paddw m3, [r5] paddw m4, [r5+r3*2+mmsize] paddw m5, [r5+r3+mmsize] paddw m6, [r5+mmsize] add r1, 2*mmsize add r5, 2*mmsize FILT_V2 m1, m2, m3, m4, m5, m6 mova m6, [pw_16] psubw m1, s20 psubw m4, s20 mova [r2+r4], m1 mova [r2+r4+mmsize], m4 paddw m1, s30 paddw m4, s30 FILT_PACK m1, m4, m6, 5, s10 CLIPW m1, m0, m7 CLIPW m4, m0, m7 mova [r0+r4], m1 mova [r0+r4+mmsize], m4 add r4, 2*mmsize jl .loop RET ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,10 add r2, r2 add r0, r2 add r1, r2 neg r2 mova m0, [tap1] mova m7, [tap3] %if num_mmregs > 8 mova m8, [tap2] mova m9, [depad] %define s1 m8 %define s2 m9 %else %define s1 [tap2] %define s2 [depad] %endif .loop: movu m1, [r1+r2-4] movu m2, [r1+r2-2] mova m3, [r1+r2+0] movu m4, [r1+r2+2] movu m5, [r1+r2+4] movu m6, [r1+r2+6] pmaddwd m1, m0 pmaddwd m2, m0 pmaddwd m3, s1 pmaddwd m4, s1 pmaddwd m5, m7 pmaddwd m6, m7 paddd m1, s2 paddd m2, s2 paddd m3, m5 paddd m4, m6 paddd m1, m3 paddd m2, m4 psrad m1, 10 psrad m2, 10 pslld m2, 16 pand m1, [pd_ffff] por m1, m2 CLIPW m1, [pb_0], [pw_pixel_max] mova [r0+r2], m1 add r2, mmsize jl .loop RET ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h, 3,4,8 %define src r1+r2 add r2, r2 add r0, r2 add r1, r2 neg r2 mova m0, [pw_pixel_max] .loop: movu m1, [src-4] movu m2, [src-2] mova m3, [src+0] movu m6, [src+2] movu m4, [src+4] movu m5, [src+6] paddw m3, m6 ; c0 paddw m2, m4 ; b0 paddw m1, m5 ; a0 %if mmsize == 16 movu m4, [src-4+mmsize] movu m5, [src-2+mmsize] %endif movu m7, [src+4+mmsize] movu m6, [src+6+mmsize] paddw m5, m7 ; b1 paddw m4, m6 ; a1 movu m7, [src+2+mmsize] mova m6, [src+0+mmsize] paddw m6, m7 ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 mova m7, [pw_1] pxor m2, m2 FILT_PACK m1, m4, m7, 1 CLIPW m1, m2, m0 CLIPW m4, m2, m0 mova [r0+r2], m1 mova [r0+r2+mmsize], m4 add r2, mmsize*2 jl .loop RET %endmacro ; HPEL_FILTER INIT_MMX mmx2 HPEL_FILTER INIT_XMM sse2 HPEL_FILTER %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 
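; The 8-bit half-pel filters below apply the 6-tap kernel [1,-5,20,20,-5,1]:
; hpel_filter_v keeps the raw 16-bit vertical sums in buf and packs (sum+16)>>5 into dstv,
; hpel_filter_h packs the horizontal (sum+16)>>5 into dsth, and hpel_filter_c runs the
; horizontal taps over buf so dstc comes out as roughly (sum+512)>>10 (FILT_H folds part of
; the shift, so intermediate rounding differs slightly from the straightforward formula).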
%macro HPEL_V 1 ;----------------------------------------------------------------------------- ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_v, 5,6,%1 lea r5, [r1+r3] sub r1, r3 sub r1, r3 add r0, r4 lea r2, [r2+r4*2] neg r4 %if cpuflag(ssse3) mova m0, [filt_mul15] %else pxor m0, m0 %endif .loop: %if cpuflag(ssse3) mova m1, [r1] mova m4, [r1+r3] mova m2, [r5+r3*2] mova m5, [r5+r3] mova m3, [r1+r3*2] mova m6, [r5] SBUTTERFLY bw, 1, 4, 7 SBUTTERFLY bw, 2, 5, 7 SBUTTERFLY bw, 3, 6, 7 pmaddubsw m1, m0 pmaddubsw m4, m0 pmaddubsw m2, m0 pmaddubsw m5, m0 pmaddubsw m3, [filt_mul20] pmaddubsw m6, [filt_mul20] paddw m1, m2 paddw m4, m5 paddw m1, m3 paddw m4, m6 mova m7, [pw_1024] %else LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1 LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1 LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0 LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1 FILT_V2 m1, m2, m3, m4, m5, m6 mova m7, [pw_16] %endif %if mmsize==32 mova [r2+r4*2], xm1 mova [r2+r4*2+mmsize/2], xm4 vextracti128 [r2+r4*2+mmsize], m1, 1 vextracti128 [r2+r4*2+mmsize*3/2], m4, 1 %else mova [r2+r4*2], m1 mova [r2+r4*2+mmsize], m4 %endif FILT_PACK m1, m4, m7, 5 movnta [r0+r4], m1 add r1, mmsize add r5, mmsize add r4, mmsize jl .loop RET %endmacro ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal hpel_filter_c, 3,3 add r0, r2 lea r1, [r1+r2*2] neg r2 %define src r1+r2*2 movq m7, [pw_32] .loop: movq m1, [src-4] movq m2, [src-2] movq m3, [src ] movq m4, [src+4] movq m5, [src+6] paddw m3, [src+2] ; c0 paddw m2, m4 ; b0 paddw m1, m5 ; a0 movq m6, [src+8] paddw m4, [src+14] ; a1 paddw m5, [src+12] ; b1 paddw m6, [src+10] ; c1 FILT_H2 m1, m2, m3, m4, m5, m6 FILT_PACK m1, m4, m7, 6 movntq [r0+r2], m1 add r2, 8 jl .loop RET ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 %define src r1+r2 pxor m0, m0 .loop: movd m1, [src-2] movd m2, [src-1] movd m3, [src ] movd m6, [src+1] movd m4, [src+2] movd m5, [src+3] punpcklbw m1, m0 punpcklbw m2, m0 punpcklbw m3, m0 punpcklbw m6, m0 punpcklbw m4, m0 punpcklbw m5, m0 paddw m3, m6 ; c0 paddw m2, m4 ; b0 paddw m1, m5 ; a0 movd m7, [src+7] movd m6, [src+6] punpcklbw m7, m0 punpcklbw m6, m0 paddw m4, m7 ; c1 paddw m5, m6 ; b1 movd m7, [src+5] movd m6, [src+4] punpcklbw m7, m0 punpcklbw m6, m0 paddw m6, m7 ; a1 movq m7, [pw_1] FILT_H2 m1, m2, m3, m4, m5, m6 FILT_PACK m1, m4, m7, 1 movntq [r0+r2], m1 add r2, 8 jl .loop RET %macro HPEL_C 0 ;----------------------------------------------------------------------------- ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,9 add r0, r2 lea r1, [r1+r2*2] neg r2 %define src r1+r2*2 %ifnidn cpuname, sse2 %if cpuflag(ssse3) mova m7, [pw_512] %else mova m7, [pw_32] %endif %define pw_rnd m7 %elif ARCH_X86_64 mova m8, [pw_32] %define pw_rnd m8 %else %define pw_rnd [pw_32] %endif ; This doesn't seem to be faster (with AVX) on Sandy 
Bridge or Bulldozer... %if mmsize==32 .loop: movu m4, [src-4] movu m5, [src-2] mova m6, [src+0] movu m3, [src-4+mmsize] movu m2, [src-2+mmsize] mova m1, [src+0+mmsize] paddw m4, [src+6] paddw m5, [src+4] paddw m6, [src+2] paddw m3, [src+6+mmsize] paddw m2, [src+4+mmsize] paddw m1, [src+2+mmsize] FILT_H2 m4, m5, m6, m3, m2, m1 %else mova m0, [src-16] mova m1, [src] .loop: mova m2, [src+16] PALIGNR m4, m1, m0, 12, m7 PALIGNR m5, m1, m0, 14, m0 PALIGNR m0, m2, m1, 6, m7 paddw m4, m0 PALIGNR m0, m2, m1, 4, m7 paddw m5, m0 PALIGNR m6, m2, m1, 2, m7 paddw m6, m1 FILT_H m4, m5, m6 mova m0, m2 mova m5, m2 PALIGNR m2, m1, 12, m7 PALIGNR m5, m1, 14, m1 mova m1, [src+32] PALIGNR m3, m1, m0, 6, m7 paddw m3, m2 PALIGNR m6, m1, m0, 4, m7 paddw m5, m6 PALIGNR m6, m1, m0, 2, m7 paddw m6, m0 FILT_H m3, m5, m6 %endif FILT_PACK m4, m3, pw_rnd, 6 %if mmsize==32 vpermq m4, m4, q3120 %endif movnta [r0+r2], m4 add r2, mmsize jl .loop RET %endmacro ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal hpel_filter_h, 3,3,8 add r0, r2 add r1, r2 neg r2 %define src r1+r2 pxor m0, m0 .loop: movh m1, [src-2] movh m2, [src-1] movh m3, [src ] movh m4, [src+1] movh m5, [src+2] movh m6, [src+3] punpcklbw m1, m0 punpcklbw m2, m0 punpcklbw m3, m0 punpcklbw m4, m0 punpcklbw m5, m0 punpcklbw m6, m0 paddw m3, m4 ; c0 paddw m2, m5 ; b0 paddw m1, m6 ; a0 movh m4, [src+6] movh m5, [src+7] movh m6, [src+10] movh m7, [src+11] punpcklbw m4, m0 punpcklbw m5, m0 punpcklbw m6, m0 punpcklbw m7, m0 paddw m5, m6 ; b1 paddw m4, m7 ; a1 movh m6, [src+8] movh m7, [src+9] punpcklbw m6, m0 punpcklbw m7, m0 paddw m6, m7 ; c1 mova m7, [pw_1] ; FIXME xmm8 FILT_H2 m1, m2, m3, m4, m5, m6 FILT_PACK m1, m4, m7, 1 movntps [r0+r2], m1 add r2, 16 jl .loop RET ;----------------------------------------------------------------------------- ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_H 0 cglobal hpel_filter_h, 3,3 add r0, r2 add r1, r2 neg r2 %define src r1+r2 mova m0, [src-16] mova m1, [src] mova m7, [pw_1024] .loop: mova m2, [src+16] ; Using unaligned loads instead of palignr is marginally slower on SB and significantly ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid ; the repeated loads of constants for pmaddubsw. 
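; In this loop the byte constants filt_mul15/filt_mul20/filt_mul51 encode the {1,-5}, {20,20}
; and {-5,1} tap pairs, so three pmaddubsw per output vector accumulate the 6-tap sum;
; FILT_PACK's pmulhrsw with pw_1024 is equivalent to (sum+16)>>5, and the pshufb with
; hpel_shuf reorders the packed even/odd results back into natural pixel order.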
palignr m3, m1, m0, 14 palignr m4, m1, m0, 15 palignr m0, m2, m1, 2 pmaddubsw m3, [filt_mul15] pmaddubsw m4, [filt_mul15] pmaddubsw m0, [filt_mul51] palignr m5, m2, m1, 1 palignr m6, m2, m1, 3 paddw m3, m0 mova m0, m1 pmaddubsw m1, [filt_mul20] pmaddubsw m5, [filt_mul20] pmaddubsw m6, [filt_mul51] paddw m3, m1 paddw m4, m5 paddw m4, m6 FILT_PACK m3, m4, m7, 5 pshufb m3, [hpel_shuf] mova m1, m2 movntps [r0+r2], m3 add r2, 16 jl .loop RET %endmacro INIT_MMX mmx2 HPEL_V 0 INIT_XMM sse2 HPEL_V 8 %if ARCH_X86_64 == 0 INIT_XMM sse2 HPEL_C INIT_XMM ssse3 HPEL_C HPEL_V 0 HPEL_H INIT_XMM avx HPEL_C HPEL_V 0 HPEL_H INIT_YMM avx2 HPEL_V 8 HPEL_C INIT_YMM avx2 cglobal hpel_filter_h, 3,3,8 add r0, r2 add r1, r2 neg r2 %define src r1+r2 mova m5, [filt_mul15] mova m6, [filt_mul20] mova m7, [filt_mul51] .loop: movu m0, [src-2] movu m1, [src-1] movu m2, [src+2] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m7 paddw m0, m2 mova m2, [src+0] movu m3, [src+1] movu m4, [src+3] pmaddubsw m2, m6 pmaddubsw m3, m6 pmaddubsw m4, m7 paddw m0, m2 paddw m1, m3 paddw m1, m4 mova m2, [pw_1024] FILT_PACK m0, m1, m2, 5 pshufb m0, [hpel_shuf] movnta [r0+r2], m0 add r2, mmsize jl .loop RET %endif %if ARCH_X86_64 %macro DO_FILT_V 5 ;The optimum prefetch distance is difficult to determine in checkasm: ;any prefetch seems slower than not prefetching. ;In real use, the prefetch seems to be a slight win. ;+mmsize is picked somewhat arbitrarily here based on the fact that even one ;loop iteration is going to take longer than the prefetch. prefetcht0 [r1+r2*2+mmsize] %if cpuflag(ssse3) mova m1, [r3] mova m2, [r3+r2] mova %3, [r3+r2*2] mova m3, [r1] mova %1, [r1+r2] mova %2, [r1+r2*2] punpckhbw m4, m1, m2 punpcklbw m1, m2 punpckhbw m2, %1, %2 punpcklbw %1, %2 punpckhbw %2, m3, %3 punpcklbw m3, %3 pmaddubsw m1, m12 pmaddubsw m4, m12 pmaddubsw %1, m0 pmaddubsw m2, m0 pmaddubsw m3, m14 pmaddubsw %2, m14 paddw m1, %1 paddw m4, m2 paddw m1, m3 paddw m4, %2 %else LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1 LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1 LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1 packuswb %3, %4 FILT_V2 m1, m2, m3, m4, m5, m6 %endif add r3, mmsize add r1, mmsize %if mmsize==32 vinserti128 %1, m1, xm4, 1 vperm2i128 %2, m1, m4, q0301 %else mova %1, m1 mova %2, m4 %endif FILT_PACK m1, m4, m15, 5 movntps [r8+r4+%5], m1 %endmacro %macro FILT_C 3 %if mmsize==32 vperm2i128 m3, %2, %1, q0003 %endif PALIGNR m1, %2, %1, (mmsize-4), m3 PALIGNR m2, %2, %1, (mmsize-2), m3 %if mmsize==32 vperm2i128 %1, %3, %2, q0003 %endif PALIGNR m3, %3, %2, 4, %1 PALIGNR m4, %3, %2, 2, %1 paddw m3, m2 %if mmsize==32 mova m2, %1 %endif mova %1, %3 PALIGNR %3, %3, %2, 6, m2 paddw m4, %2 paddw %3, m1 FILT_H %3, m3, m4 %endmacro %macro DO_FILT_C 4 FILT_C %1, %2, %3 FILT_C %2, %1, %4 FILT_PACK %3, %4, m15, 6 %if mmsize==32 vpermq %3, %3, q3120 %endif movntps [r5+r4], %3 %endmacro %macro ADD8TO16 5 punpckhbw %3, %1, %5 punpcklbw %1, %5 punpcklbw %4, %2, %5 punpckhbw %2, %5 paddw %2, %3 paddw %1, %4 %endmacro %macro DO_FILT_H 3 %if mmsize==32 vperm2i128 m3, %2, %1, q0003 %endif PALIGNR m1, %2, %1, (mmsize-2), m3 PALIGNR m2, %2, %1, (mmsize-1), m3 %if mmsize==32 vperm2i128 m3, %3, %2, q0003 %endif PALIGNR m4, %3, %2, 1 , m3 PALIGNR m5, %3, %2, 2 , m3 PALIGNR m6, %3, %2, 3 , m3 mova %1, %2 %if cpuflag(ssse3) pmaddubsw m1, m12 pmaddubsw m2, m12 pmaddubsw %2, m14 pmaddubsw m4, m14 pmaddubsw m5, m0 pmaddubsw m6, m0 paddw m1, %2 paddw m2, m4 paddw m1, m5 paddw m2, m6 FILT_PACK m1, m2, m15, 5 pshufb m1, [hpel_shuf] %else ; ssse3, avx 
ADD8TO16 m1, m6, m12, m3, m0 ; a ADD8TO16 m2, m5, m12, m3, m0 ; b ADD8TO16 %2, m4, m12, m3, m0 ; c FILT_V2 m1, m2, %2, m6, m5, m4 FILT_PACK m1, m6, m15, 5 %endif movntps [r0+r4], m1 mova %2, %3 %endmacro %macro HPEL 0 ;----------------------------------------------------------------------------- ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; uint8_t *src, intptr_t stride, int width, int height ) ;----------------------------------------------------------------------------- cglobal hpel_filter, 7,9,16 mov r7, r3 sub r5d, mmsize mov r8, r1 and r7, mmsize-1 sub r3, r7 add r0, r5 add r8, r5 add r7, r5 add r5, r2 mov r2, r4 neg r7 lea r1, [r3+r2] sub r3, r2 sub r3, r2 mov r4, r7 %if cpuflag(ssse3) mova m0, [filt_mul51] mova m12, [filt_mul15] mova m14, [filt_mul20] mova m15, [pw_1024] %else pxor m0, m0 mova m15, [pw_16] %endif ;ALIGN 16 .loopy: ; first filter_v DO_FILT_V m8, m7, m13, m12, 0 ;ALIGN 16 .loopx: DO_FILT_V m6, m5, m11, m12, mmsize .lastx: %if cpuflag(ssse3) psrlw m15, 1 ; pw_512 %else paddw m15, m15 ; pw_32 %endif DO_FILT_C m9, m8, m7, m6 %if cpuflag(ssse3) paddw m15, m15 ; pw_1024 %else psrlw m15, 1 ; pw_16 %endif mova m7, m5 DO_FILT_H m10, m13, m11 add r4, mmsize jl .loopx cmp r4, mmsize jl .lastx ; setup regs for next y sub r4, r7 sub r4, r2 sub r1, r4 sub r3, r4 add r0, r2 add r8, r2 add r5, r2 mov r4, r7 sub r6d, 1 jg .loopy sfence RET %endmacro INIT_XMM sse2 HPEL INIT_XMM ssse3 HPEL INIT_XMM avx HPEL INIT_YMM avx2 HPEL %endif ; ARCH_X86_64 %undef movntq %undef movntps %undef sfence %endif ; !HIGH_BIT_DEPTH %macro PREFETCHNT_ITER 2 ; src, bytes/iteration %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal? %rep (%2+63) / 64 ; assume 64 byte cache lines prefetchnta [%1+%%i] %assign %%i %%i + 64 %endrep %endmacro ;----------------------------------------------------------------------------- ; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of mmsize, and i_dst>w %macro PLANE_COPY_CORE 1 ; swap %if %1 cglobal plane_copy_swap_core, 6,7 %if mmsize == 32 vbroadcasti128 m4, [copy_swap_shuf] %else mova m4, [copy_swap_shuf] %endif %else cglobal plane_copy_core, 6,7 %endif FIX_STRIDES r1, r3 %if %1 && HIGH_BIT_DEPTH shl r4d, 2 %elif %1 || HIGH_BIT_DEPTH add r4d, r4d %else movsxdifnidn r4, r4d %endif add r0, r4 add r2, r4 neg r4 .loopy: lea r6, [r4+4*mmsize] %if %1 test r6d, r6d jg .skip %endif .loopx: PREFETCHNT_ITER r2+r6, 4*mmsize movu m0, [r2+r6-4*mmsize] movu m1, [r2+r6-3*mmsize] movu m2, [r2+r6-2*mmsize] movu m3, [r2+r6-1*mmsize] %if %1 pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 %endif movnta [r0+r6-4*mmsize], m0 movnta [r0+r6-3*mmsize], m1 movnta [r0+r6-2*mmsize], m2 movnta [r0+r6-1*mmsize], m3 add r6, 4*mmsize jle .loopx .skip: PREFETCHNT_ITER r2+r6, 4*mmsize sub r6, 4*mmsize jz .end .loop_end: movu m0, [r2+r6] %if %1 pshufb m0, m4 %endif movnta [r0+r6], m0 add r6, mmsize jl .loop_end .end: add r0, r1 add r2, r3 dec r5d jg .loopy sfence RET %endmacro INIT_XMM sse PLANE_COPY_CORE 0 INIT_XMM ssse3 PLANE_COPY_CORE 1 INIT_YMM avx PLANE_COPY_CORE 0 INIT_YMM avx2 PLANE_COPY_CORE 1 %macro PLANE_COPY_AVX512 1 ; swap %if %1 cglobal plane_copy_swap, 6,7 vbroadcasti32x4 m4, [copy_swap_shuf] %else cglobal plane_copy, 6,7 %endif movsxdifnidn r4, r4d %if %1 && HIGH_BIT_DEPTH %define %%mload vmovdqu32 lea r2, [r2+4*r4-64] lea r0, [r0+4*r4-64] neg r4 mov r6d, r4d shl r4, 2 or r6d, 
0xffff0010 shrx r6d, r6d, r6d ; (1 << (w & 15)) - 1 kmovw k1, r6d %elif %1 || HIGH_BIT_DEPTH %define %%mload vmovdqu16 lea r2, [r2+2*r4-64] lea r0, [r0+2*r4-64] mov r6d, -1 neg r4 shrx r6d, r6d, r4d add r4, r4 kmovd k1, r6d %else %define %%mload vmovdqu8 lea r2, [r2+1*r4-64] lea r0, [r0+1*r4-64] mov r6, -1 neg r4 shrx r6, r6, r4 %if ARCH_X86_64 kmovq k1, r6 %else kmovd k1, r6d test r4d, 32 jnz .l32 kxnord k2, k2, k2 kunpckdq k1, k1, k2 .l32: %endif %endif FIX_STRIDES r3, r1 add r4, 4*64 jge .small mov r6, r4 .loop: ; >256 bytes/row PREFETCHNT_ITER r2+r4+64, 4*64 movu m0, [r2+r4-3*64] movu m1, [r2+r4-2*64] movu m2, [r2+r4-1*64] movu m3, [r2+r4-0*64] %if %1 pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 %endif movnta [r0+r4-3*64], m0 movnta [r0+r4-2*64], m1 movnta [r0+r4-1*64], m2 movnta [r0+r4-0*64], m3 add r4, 4*64 jl .loop PREFETCHNT_ITER r2+r4+64, 4*64 sub r4, 3*64 jge .tail .loop2: movu m0, [r2+r4] %if %1 pshufb m0, m4 %endif movnta [r0+r4], m0 add r4, 64 jl .loop2 .tail: %%mload m0 {k1}{z}, [r2+r4] %if %1 pshufb m0, m4 %endif movnta [r0+r4], m0 add r2, r3 add r0, r1 mov r4, r6 dec r5d jg .loop sfence RET .small: ; 65-256 bytes/row. skip non-temporal stores sub r4, 3*64 jge .tiny mov r6, r4 .small_loop: PREFETCHNT_ITER r2+r4+64, 64 movu m0, [r2+r4] %if %1 pshufb m0, m4 %endif mova [r0+r4], m0 add r4, 64 jl .small_loop PREFETCHNT_ITER r2+r4+64, 64 %%mload m0 {k1}{z}, [r2+r4] %if %1 pshufb m0, m4 %endif mova [r0+r4], m0 add r2, r3 add r0, r1 mov r4, r6 dec r5d jg .small_loop RET .tiny: ; 1-64 bytes/row. skip non-temporal stores PREFETCHNT_ITER r2+r4+64, 64 %%mload m0 {k1}{z}, [r2+r4] %if %1 pshufb m0, m4 %endif mova [r0+r4], m0 add r2, r3 add r0, r1 dec r5d jg .tiny RET %endmacro INIT_ZMM avx512 PLANE_COPY_AVX512 0 PLANE_COPY_AVX512 1 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH %assign x 0 %rep 16/mmsize mov%4 m0, [%2+(x/2)*mmsize] mov%4 m1, [%3+(x/2)*mmsize] punpckhwd m2, m0, m1 punpcklwd m0, m1 mov%5a [%1+(x+0)*mmsize], m0 mov%5a [%1+(x+1)*mmsize], m2 %assign x (x+2) %endrep %else movq m0, [%2] %if mmsize==16 %ifidn %4, a punpcklbw m0, [%3] %else movq m1, [%3] punpcklbw m0, m1 %endif mov%5a [%1], m0 %else movq m1, [%3] punpckhbw m2, m0, m1 punpcklbw m0, m1 mov%5a [%1+0], m0 mov%5a [%1+8], m2 %endif %endif ; HIGH_BIT_DEPTH %endmacro %macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned mov%6 m0, [%3] %if mmsize == 32 pshufb m0, %5 vpermq m0, m0, q3120 %if %4 mova [%1], m0 %else mov%6 [%1], xm0 vextracti128 [%2], m0, 1 %endif %elif HIGH_BIT_DEPTH mov%6 m1, [%3+mmsize] psrld m2, m0, 16 psrld m3, m1, 16 pand m0, %5 pand m1, %5 packssdw m0, m1 packssdw m2, m3 mov%6 [%1], m0 mov%6 [%2], m2 %else ; !HIGH_BIT_DEPTH %if cpuflag(ssse3) pshufb m0, %5 %else mova m1, m0 pand m0, %5 psrlw m1, 8 packuswb m0, m1 %endif %if %4 mova [%1], m0 %else movq [%1], m0 movhps [%2], m0 %endif %endif ; HIGH_BIT_DEPTH %endmacro %macro PLANE_INTERLEAVE 0 ;----------------------------------------------------------------------------- ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, ; uint8_t *srcu, intptr_t i_srcu, ; uint8_t *srcv, intptr_t i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w cglobal plane_copy_interleave_core, 6,9 mov r6d, r6m %if HIGH_BIT_DEPTH FIX_STRIDES r1, r3, r5, r6d movifnidn r1mp, r1 movifnidn r3mp, r3 mov r6m, r6d %endif lea r0, [r0+r6*2] add r2, r6 add r4, r6 %if ARCH_X86_64 DECLARE_REG_TMP 7,8 
%else DECLARE_REG_TMP 1,3 %endif mov t1, r1 shr t1, SIZEOF_PIXEL sub t1, r6 mov t0d, r7m .loopy: mov r6d, r6m neg r6 .prefetch: prefetchnta [r2+r6] prefetchnta [r4+r6] add r6, 64 jl .prefetch mov r6d, r6m neg r6 .loopx: INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt add r6, 16*SIZEOF_PIXEL jl .loopx .pad: %assign n 0 %rep SIZEOF_PIXEL %if mmsize==8 movntq [r0+r6*2+(n+ 0)], m0 movntq [r0+r6*2+(n+ 8)], m0 movntq [r0+r6*2+(n+16)], m0 movntq [r0+r6*2+(n+24)], m0 %else movntdq [r0+r6*2+(n+ 0)], m0 movntdq [r0+r6*2+(n+16)], m0 %endif %assign n n+32 %endrep add r6, 16*SIZEOF_PIXEL cmp r6, t1 jl .pad add r0, r1mp add r2, r3mp add r4, r5 dec t0d jg .loopy sfence emms RET ;----------------------------------------------------------------------------- ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- cglobal store_interleave_chroma, 5,5 FIX_STRIDES r1 .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a add r2, FDEC_STRIDEB*2 add r3, FDEC_STRIDEB*2 lea r0, [r0+r1*2] sub r4d, 2 jg .loop RET %endmacro ; PLANE_INTERLEAVE %macro DEINTERLEAVE_START 0 %if mmsize == 32 vbroadcasti128 m4, [deinterleave_shuf] %elif HIGH_BIT_DEPTH mova m4, [pd_ffff] %elif cpuflag(ssse3) mova m4, [deinterleave_shuf] %else mova m4, [pw_00ff] %endif ; HIGH_BIT_DEPTH %endmacro %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- ; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta, ; pixel *dstb, intptr_t i_dstb, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 cglobal plane_copy_deinterleave, 6,9 %define %%w r7 %define %%h r8d mov r8d, r7m %else cglobal plane_copy_deinterleave, 6,7 %define %%w r6m %define %%h dword r7m %endif %if HIGH_BIT_DEPTH %assign %%n 16 %else %assign %%n mmsize/2 %endif DEINTERLEAVE_START mov r6d, r6m FIX_STRIDES r1, r3, r5, r6d add r0, r6 add r2, r6 lea r4, [r4+r6*2] neg r6 mov %%w, r6 .loop: DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, m4, u DEINTERLEAVE r0+r6+%%n, r2+r6+%%n, r4+r6*2+%%n*2, 0, m4, u add r6, %%n*2 jl .loop add r0, r1 add r2, r3 add r4, r5 mov r6, %%w dec %%h jg .loop RET %endmacro ; PLANE_DEINTERLEAVE %macro LOAD_DEINTERLEAVE_CHROMA 0 ;----------------------------------------------------------------------------- ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a add r0, FENC_STRIDEB*2 lea r1, [r1+r2*2] sub r3d, 2 jg .loop RET ;----------------------------------------------------------------------------- ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a add r0, FDEC_STRIDEB*2 lea r1, [r1+r2*2] sub r3d, 2 jg 
.loop RET %endmacro ; LOAD_DEINTERLEAVE_CHROMA %macro LOAD_DEINTERLEAVE_CHROMA_FDEC_AVX512 0 cglobal load_deinterleave_chroma_fdec, 4,5 vbroadcasti32x8 m0, [deinterleave_shuf32a] mov r4d, 0x3333ff00 kmovd k1, r4d lea r4, [r2*3] kshiftrd k2, k1, 16 .loop: vbroadcasti128 ym1, [r1] vbroadcasti32x4 m1 {k1}, [r1+r2] vbroadcasti128 ym2, [r1+r2*2] vbroadcasti32x4 m2 {k1}, [r1+r4] lea r1, [r1+r2*4] pshufb m1, m0 pshufb m2, m0 vmovdqa32 [r0] {k2}, m1 vmovdqa32 [r0+mmsize] {k2}, m2 add r0, 2*mmsize sub r3d, 4 jg .loop RET %endmacro %macro LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 0 cglobal load_deinterleave_chroma_fenc, 4,5 vbroadcasti128 m0, [deinterleave_shuf] lea r4, [r2*3] .loop: mova xm1, [r1] ; 0 vinserti128 ym1, [r1+r2], 1 ; 1 %if mmsize == 64 mova xm2, [r1+r2*4] ; 4 vinserti32x4 m1, [r1+r2*2], 2 ; 2 vinserti32x4 m2, [r1+r4*2], 2 ; 6 vinserti32x4 m1, [r1+r4], 3 ; 3 lea r1, [r1+r2*4] vinserti32x4 m2, [r1+r2], 1 ; 5 vinserti32x4 m2, [r1+r4], 3 ; 7 %else mova xm2, [r1+r2*2] ; 2 vinserti128 m2, [r1+r4], 1 ; 3 %endif lea r1, [r1+r2*4] pshufb m1, m0 pshufb m2, m0 mova [r0], m1 mova [r0+mmsize], m2 add r0, 2*mmsize sub r3d, mmsize/8 jg .loop RET %endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 %macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2 %if mmsize == 32 vbroadcasti128 m3, [deinterleave_rgb_shuf+(%1-3)*16] %elif cpuflag(ssse3) mova m3, [deinterleave_rgb_shuf+(%1-3)*16] %endif %%loopy: mov %8, r6 mov %9, %6 %%loopx: %if mmsize == 32 && %1 == 3 movu xm0, [%8+0*12] vinserti128 m0, m0, [%8+1*12], 1 movu xm1, [%8+2*12] vinserti128 m1, m1, [%8+3*12], 1 %else movu m0, [%8] movu m1, [%8+%1*mmsize/4] %endif %if cpuflag(ssse3) pshufb m0, m3 ; a0 a1 a2 a3 a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 pshufb m1, m3 ; a4 a5 a6 a7 a4 a5 a6 a7 b4 b5 b6 b7 c4 c5 c6 c7 %if mmsize == 32 vpblendd m2, m0, m1, 0x22 punpckhdq m0, m1 vpermd m2, m4, m2 vpermd m0, m4, m0 mova [r0+%9], xm2 mova [r2+%9], xm0 vextracti128 [r4+%9], m0, 1 %else SBUTTERFLY dq, 0, 1, 2 movq [r0+%9], m0 movq [r2+%9], m1 movhps [r4+%9], m1 %endif %elif %1 == 3 SBUTTERFLY bw, 0, 1, 2 pshufd m2, m0, q0321 ; c0 c4 a1 a5 b1 b5 c1 c5 __ __ __ __ a0 a4 b0 b4 punpcklbw m3, m2, m1 ; c0 c2 c4 c6 a1 a3 a5 a7 b1 b3 b5 b7 c1 c3 c5 c7 punpckhbw m2, m0 ; __ __ __ __ __ __ __ __ a0 a2 a4 a6 b0 b2 b4 b6 pshufd m0, m3, q2103 ; c1 c3 c5 c7 __ __ __ __ a1 a3 a5 a7 b1 b3 b5 b7 punpckhbw m2, m0 ; a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7 punpcklbw m3, m0 ; c0 c1 c2 c3 c4 c5 c6 c7 movq [r0+%9], m2 movhps [r2+%9], m2 movq [r4+%9], m3 %else ; %1 == 4 SBUTTERFLY bw, 0, 1, 2 SBUTTERFLY bw, 0, 1, 2 SBUTTERFLY bw, 0, 1, 2 movq [r0+%9], m0 movhps [r2+%9], m0 movq [r4+%9], m1 %endif add %8, %1*mmsize/2 add %9, mmsize/2 jl %%loopx add r0, %2 add r2, %3 add r4, %4 add r6, %5 dec %7d jg %%loopy %endmacro %macro PLANE_DEINTERLEAVE_RGB 0 ;----------------------------------------------------------------------------- ; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta, ; pixel *dstb, intptr_t i_dstb, ; pixel *dstc, intptr_t i_dstc, ; pixel *src, intptr_t i_src, int pw, int w, int h ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 cglobal plane_copy_deinterleave_rgb, 8,12 %define %%args r1, r3, r5, r7, r8, r9, r10, r11 mov r8d, r9m mov r9d, r10m add r0, r8 add r2, r8 add r4, r8 neg r8 %else cglobal plane_copy_deinterleave_rgb, 1,7 %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5 mov r1, r9m mov r2, r2m mov r4, r4m mov r6, r6m add r0, r1 add r2, r1 add r4, r1 neg r1 mov r9m, r1 mov r1, 
r10m %endif %if mmsize == 32 mova m4, [deinterleave_shufd] %endif cmp dword r8m, 4 je .pw4 PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR jmp .ret .pw4: PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA .ret: REP_RET %endmacro %macro PLANE_DEINTERLEAVE_V210 0 ;----------------------------------------------------------------------------- ; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty, ; uint16_t *dstc, intptr_t i_dstc, ; uint32_t *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 cglobal plane_copy_deinterleave_v210, 8,10,7 %define src r8 %define org_w r9 %define h r7d %else cglobal plane_copy_deinterleave_v210, 7,7,7 %define src r4m %define org_w r6m %define h dword r7m %endif FIX_STRIDES r1, r3, r6d shl r5, 2 add r0, r6 add r2, r6 neg r6 mov src, r4 mov org_w, r6 %if cpuflag(avx512) vpbroadcastd m2, [v210_mask] vpbroadcastd m3, [v210_shuf_avx512] psrlw m3, 6 ; dw 0, 4 mova m4, [v210_shuf_avx512] ; luma psrlw m5, m4, 8 ; chroma %else %if mmsize == 32 vbroadcasti128 m2, [v210_mask] vbroadcasti128 m3, [v210_luma_shuf] vbroadcasti128 m4, [v210_chroma_shuf] %else mova m2, [v210_mask] mova m3, [v210_luma_shuf] mova m4, [v210_chroma_shuf] %endif mova m5, [v210_mult] ; also functions as vpermd index for avx2 pshufd m6, m5, q1102 %endif ALIGN 16 .loop: movu m1, [r4] pandn m0, m2, m1 pand m1, m2 %if cpuflag(avx512) psrld m0, 10 vpsrlvw m1, m3 mova m6, m0 vpermt2w m0, m4, m1 vpermt2w m1, m5, m6 %else pshufb m0, m3 pshufb m1, m4 pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ %if mmsize == 32 vpermd m0, m5, m0 vpermd m1, m5, m1 %endif %endif movu [r0+r6], m0 movu [r2+r6], m1 add r4, mmsize add r6, mmsize*3/4 jl .loop add r0, r1 add r2, r3 add src, r5 mov r4, src mov r6, org_w dec h jg .loop RET %endmacro ; PLANE_DEINTERLEAVE_V210 INIT_MMX mmx2 PLANE_INTERLEAVE INIT_XMM sse2 PLANE_INTERLEAVE PLANE_DEINTERLEAVE LOAD_DEINTERLEAVE_CHROMA INIT_YMM avx2 PLANE_DEINTERLEAVE %if HIGH_BIT_DEPTH INIT_XMM ssse3 PLANE_DEINTERLEAVE_V210 INIT_XMM avx PLANE_INTERLEAVE PLANE_DEINTERLEAVE LOAD_DEINTERLEAVE_CHROMA PLANE_DEINTERLEAVE_V210 INIT_YMM avx2 LOAD_DEINTERLEAVE_CHROMA PLANE_DEINTERLEAVE_V210 INIT_ZMM avx512 PLANE_DEINTERLEAVE_V210 %else INIT_XMM sse2 PLANE_DEINTERLEAVE_RGB INIT_XMM ssse3 PLANE_DEINTERLEAVE LOAD_DEINTERLEAVE_CHROMA PLANE_DEINTERLEAVE_RGB INIT_YMM avx2 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 PLANE_DEINTERLEAVE_RGB INIT_ZMM avx512 LOAD_DEINTERLEAVE_CHROMA_FDEC_AVX512 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 %endif ; These functions are not general-use; not only do they require aligned input, but memcpy ; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128. 
;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- %macro MEMCPY 0 cglobal memcpy_aligned, 3,3 %if mmsize == 32 test r2d, 16 jz .copy32 mova xm0, [r1+r2-16] mova [r0+r2-16], xm0 sub r2d, 16 jle .ret .copy32: %endif test r2d, mmsize jz .loop mova m0, [r1+r2-mmsize] mova [r0+r2-mmsize], m0 sub r2d, mmsize jle .ret .loop: mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 sub r2d, 2*mmsize jg .loop .ret: RET %endmacro ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- %macro MEMZERO 0 cglobal memzero_aligned, 2,2 xorps m0, m0 .loop: %assign %%i mmsize %rep 128 / mmsize movaps [r0 + r1 - %%i], m0 %assign %%i %%i+mmsize %endrep sub r1d, 128 jg .loop RET %endmacro INIT_XMM sse MEMCPY MEMZERO INIT_YMM avx MEMCPY MEMZERO INIT_ZMM avx512 MEMZERO cglobal memcpy_aligned, 3,4 dec r2d ; offset of the last byte rorx r3d, r2d, 2 and r2d, ~63 and r3d, 15 ; n = number of dwords minus one to copy in the tail mova m0, [r1+r2] not r3d ; bits 0-4: (n^15)+16, bits 16-31: 0xffff shrx r3d, r3d, r3d ; 0xffff >> (n^15) kmovw k1, r3d ; (1 << (n+1)) - 1 vmovdqa32 [r0+r2] {k1}, m0 sub r2d, 64 jl .ret .loop: mova m0, [r1+r2] mova [r0+r2], m0 sub r2d, 64 jge .loop .ret: RET %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ;----------------------------------------------------------------------------- %macro INTEGRAL_INIT4H 0 cglobal integral_init4h, 3,4 lea r3, [r0+r2*2] add r1, r2 neg r2 pxor m4, m4 .loop: mova xm0, [r1+r2] mova xm1, [r1+r2+16] %if mmsize==32 vinserti128 m0, m0, [r1+r2+ 8], 1 vinserti128 m1, m1, [r1+r2+24], 1 %else palignr m1, m0, 8 %endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] paddw m1, [r0+r2*2+mmsize] mova [r3+r2*2 ], m0 mova [r3+r2*2+mmsize], m1 add r2, mmsize jl .loop RET %endmacro INIT_XMM sse4 INTEGRAL_INIT4H INIT_YMM avx2 INTEGRAL_INIT4H %macro INTEGRAL_INIT8H 0 cglobal integral_init8h, 3,4 lea r3, [r0+r2*2] add r1, r2 neg r2 pxor m4, m4 .loop: mova xm0, [r1+r2] mova xm1, [r1+r2+16] %if mmsize==32 vinserti128 m0, m0, [r1+r2+ 8], 1 vinserti128 m1, m1, [r1+r2+24], 1 mpsadbw m2, m0, m4, 100100b mpsadbw m3, m1, m4, 100100b %else palignr m1, m0, 8 mpsadbw m2, m0, m4, 100b mpsadbw m3, m1, m4, 100b %endif mpsadbw m0, m4, 0 mpsadbw m1, m4, 0 paddw m0, [r0+r2*2] paddw m1, [r0+r2*2+mmsize] paddw m0, m2 paddw m1, m3 mova [r3+r2*2 ], m0 mova [r3+r2*2+mmsize], m1 add r2, mmsize jl .loop RET %endmacro INIT_XMM sse4 INTEGRAL_INIT8H INIT_XMM avx INTEGRAL_INIT8H INIT_YMM avx2 INTEGRAL_INIT8H %endif ; !HIGH_BIT_DEPTH %macro INTEGRAL_INIT_8V 0 ;----------------------------------------------------------------------------- ; void integral_init8v( uint16_t *sum8, intptr_t stride ) ;----------------------------------------------------------------------------- cglobal integral_init8v, 3,3 add r1, r1 add r0, r1 lea r2, [r0+r1*8] neg r1 .loop: mova m0, [r2+r1] mova m1, [r2+r1+mmsize] psubw m0, [r0+r1] psubw m1, [r0+r1+mmsize] mova [r0+r1], m0 mova [r0+r1+mmsize], m1 add r1, 2*mmsize jl .loop RET %endmacro INIT_MMX mmx INTEGRAL_INIT_8V INIT_XMM sse2 INTEGRAL_INIT_8V INIT_YMM avx2 INTEGRAL_INIT_8V 
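; Rough overview: integral_init4h/8h accumulate 4- and 8-pixel-wide horizontal sums down the
; frame, and integral_init4v/8v then difference rows 4 or 8 apart, leaving per-position sums
; over 4x4 and 8x8 pixel blocks; these tables feed the exhaustive (ESA/TESA) motion search.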
;----------------------------------------------------------------------------- ; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ;----------------------------------------------------------------------------- INIT_MMX mmx cglobal integral_init4v, 3,5 shl r2, 1 lea r3, [r0+r2*4] lea r4, [r0+r2*8] mova m0, [r0+r2] mova m4, [r4+r2] .loop: mova m1, m4 psubw m1, m0 mova m4, [r4+r2-8] mova m0, [r0+r2-8] paddw m1, m4 mova m3, [r3+r2-8] psubw m1, m0 psubw m3, m0 mova [r0+r2-8], m1 mova [r1+r2-8], m3 sub r2, 8 jge .loop RET INIT_XMM sse2 cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 lea r3, [r0+r2*4] lea r4, [r0+r2*8] neg r2 .loop: mova m0, [r0+r2] mova m1, [r4+r2] mova m2, m0 mova m4, m1 shufpd m0, [r0+r2+16], 1 shufpd m1, [r4+r2+16], 1 paddw m0, m2 paddw m1, m4 mova m3, [r3+r2] psubw m1, m0 psubw m3, m2 mova [r0+r2], m1 mova [r1+r2], m3 add r2, 16 jl .loop RET INIT_XMM ssse3 cglobal integral_init4v, 3,5 shl r2, 1 add r0, r2 add r1, r2 lea r3, [r0+r2*4] lea r4, [r0+r2*8] neg r2 .loop: mova m2, [r0+r2] mova m0, [r0+r2+16] mova m4, [r4+r2] mova m1, [r4+r2+16] palignr m0, m2, 8 palignr m1, m4, 8 paddw m0, m2 paddw m1, m4 mova m3, [r3+r2] psubw m1, m0 psubw m3, m2 mova [r0+r2], m1 mova [r1+r2], m3 add r2, 16 jl .loop RET INIT_YMM avx2 cglobal integral_init4v, 3,5 add r2, r2 add r0, r2 add r1, r2 lea r3, [r0+r2*4] lea r4, [r0+r2*8] neg r2 .loop: mova m2, [r0+r2] movu m1, [r4+r2+8] paddw m0, m2, [r0+r2+8] paddw m1, [r4+r2] mova m3, [r3+r2] psubw m1, m0 psubw m3, m2 mova [r0+r2], m1 mova [r1+r2], m3 add r2, 32 jl .loop RET %macro FILT8x4 7 mova %3, [r0+%7] mova %4, [r0+r5+%7] pavgb %3, %4 pavgb %4, [r0+r5*2+%7] PALIGNR %1, %3, 1, m6 PALIGNR %2, %4, 1, m6 %if cpuflag(xop) pavgb %1, %3 pavgb %2, %4 %else pavgb %1, %3 pavgb %2, %4 psrlw %5, %1, 8 psrlw %6, %2, 8 pand %1, m7 pand %2, m7 %endif %endmacro %macro FILT32x4U 4 mova m1, [r0+r5] pavgb m0, m1, [r0] movu m3, [r0+r5+1] pavgb m2, m3, [r0+1] pavgb m1, [r0+r5*2] pavgb m3, [r0+r5*2+1] pavgb m0, m2 pavgb m1, m3 mova m3, [r0+r5+mmsize] pavgb m2, m3, [r0+mmsize] movu m5, [r0+r5+1+mmsize] pavgb m4, m5, [r0+1+mmsize] pavgb m3, [r0+r5*2+mmsize] pavgb m5, [r0+r5*2+1+mmsize] pavgb m2, m4 pavgb m3, m5 pshufb m0, m7 pshufb m1, m7 pshufb m2, m7 pshufb m3, m7 punpckhqdq m4, m0, m2 punpcklqdq m0, m0, m2 punpckhqdq m5, m1, m3 punpcklqdq m2, m1, m3 vpermq m0, m0, q3120 vpermq m1, m4, q3120 vpermq m2, m2, q3120 vpermq m3, m5, q3120 mova [%1], m0 mova [%2], m1 mova [%3], m2 mova [%4], m3 %endmacro %macro FILT16x2 4 mova m3, [r0+%4+mmsize] mova m2, [r0+%4] pavgb m3, [r0+%4+r5+mmsize] pavgb m2, [r0+%4+r5] PALIGNR %1, m3, 1, m6 pavgb %1, m3 PALIGNR m3, m2, 1, m6 pavgb m3, m2 %if cpuflag(xop) vpperm m5, m3, %1, m7 vpperm m3, m3, %1, m6 %else psrlw m5, m3, 8 psrlw m4, %1, 8 pand m3, m7 pand %1, m7 packuswb m3, %1 packuswb m5, m4 %endif mova [%2], m3 mova [%3], m5 mova %1, m2 %endmacro %macro FILT8x2U 3 mova m3, [r0+%3+8] mova m2, [r0+%3] pavgb m3, [r0+%3+r5+8] pavgb m2, [r0+%3+r5] mova m1, [r0+%3+9] mova m0, [r0+%3+1] pavgb m1, [r0+%3+r5+9] pavgb m0, [r0+%3+r5+1] pavgb m1, m3 pavgb m0, m2 psrlw m3, m1, 8 psrlw m2, m0, 8 pand m1, m7 pand m0, m7 packuswb m0, m1 packuswb m2, m3 mova [%1], m0 mova [%2], m2 %endmacro %macro FILT8xU 3 mova m3, [r0+%3+8] mova m2, [r0+%3] pavgw m3, [r0+%3+r5+8] pavgw m2, [r0+%3+r5] movu m1, [r0+%3+10] movu m0, [r0+%3+2] pavgw m1, [r0+%3+r5+10] pavgw m0, [r0+%3+r5+2] pavgw m1, m3 pavgw m0, m2 psrld m3, m1, 16 psrld m2, m0, 16 pand m1, m7 pand m0, m7 packssdw m0, m1 packssdw m2, m3 movu [%1], m0 mova [%2], m2 
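; The FILT* helpers above and below average 2x2 neighbourhoods (pavgb/pavgw) and split the
; results into even/odd columns; frame_init_lowres_core uses them to build the four
; half-pel-offset half-resolution planes (dst0/dsth/dstv/dstc) consumed by the lookahead.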
%endmacro %macro FILT8xA 4 mova m3, [r0+%4+mmsize] mova m2, [r0+%4] pavgw m3, [r0+%4+r5+mmsize] pavgw m2, [r0+%4+r5] PALIGNR %1, m3, 2, m6 pavgw %1, m3 PALIGNR m3, m2, 2, m6 pavgw m3, m2 %if cpuflag(xop) vpperm m5, m3, %1, m7 vpperm m3, m3, %1, m6 %else psrld m5, m3, 16 psrld m4, %1, 16 pand m3, m7 pand %1, m7 packssdw m3, %1 packssdw m5, m4 %endif mova [%2], m3 mova [%3], m5 mova %1, m2 %endmacro ;----------------------------------------------------------------------------- ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; intptr_t src_stride, intptr_t dst_stride, int width, int height ) ;----------------------------------------------------------------------------- %macro FRAME_INIT_LOWRES 0 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise %if HIGH_BIT_DEPTH shl dword r6m, 1 FIX_STRIDES r5 shl dword r7m, 1 %endif %if mmsize >= 16 add dword r7m, mmsize-1 and dword r7m, ~(mmsize-1) %endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m dec r6d imul r6d, r5d add r6d, r7m lea r0, [r0+r6*2] ; dst += (height-1)*stride + width mov r6d, r8m dec r6d imul r6d, r6m add r6d, r7m add r1, r6 add r2, r6 add r3, r6 add r4, r6 ; gap = stride - width mov r6d, r6m sub r6d, r7m PUSH r6 %define dst_gap [rsp+gprsize] mov r6d, r5d sub r6d, r7m shl r6d, 1 PUSH r6 %define src_gap [rsp] %if HIGH_BIT_DEPTH %if cpuflag(xop) mova m6, [deinterleave_shuf32a] mova m7, [deinterleave_shuf32b] %else pcmpeqw m7, m7 psrld m7, 16 %endif .vloop: mov r6d, r7m %ifnidn cpuname, mmx2 mova m0, [r0] mova m1, [r0+r5] pavgw m0, m1 pavgw m1, [r0+r5*2] %endif .hloop: sub r0, mmsize*2 sub r1, mmsize sub r2, mmsize sub r3, mmsize sub r4, mmsize %ifidn cpuname, mmx2 FILT8xU r1, r2, 0 FILT8xU r3, r4, r5 %else FILT8xA m0, r1, r2, 0 FILT8xA m1, r3, r4, r5 %endif sub r6d, mmsize jg .hloop %else ; !HIGH_BIT_DEPTH %if cpuflag(avx2) vbroadcasti128 m7, [deinterleave_shuf] %elif cpuflag(xop) mova m6, [deinterleave_shuf32a] mova m7, [deinterleave_shuf32b] %else pcmpeqb m7, m7 psrlw m7, 8 %endif .vloop: mov r6d, r7m %ifnidn cpuname, mmx2 %if mmsize <= 16 mova m0, [r0] mova m1, [r0+r5] pavgb m0, m1 pavgb m1, [r0+r5*2] %endif %endif .hloop: sub r0, mmsize*2 sub r1, mmsize sub r2, mmsize sub r3, mmsize sub r4, mmsize %if mmsize==32 FILT32x4U r1, r2, r3, r4 %elifdef m8 FILT8x4 m0, m1, m2, m3, m10, m11, mmsize mova m8, m0 mova m9, m1 FILT8x4 m2, m3, m0, m1, m4, m5, 0 %if cpuflag(xop) vpperm m4, m2, m8, m7 vpperm m2, m2, m8, m6 vpperm m5, m3, m9, m7 vpperm m3, m3, m9, m6 %else packuswb m2, m8 packuswb m3, m9 packuswb m4, m10 packuswb m5, m11 %endif mova [r1], m2 mova [r2], m4 mova [r3], m3 mova [r4], m5 %elifidn cpuname, mmx2 FILT8x2U r1, r2, 0 FILT8x2U r3, r4, r5 %else FILT16x2 m0, r1, r2, 0 FILT16x2 m1, r3, r4, r5 %endif sub r6d, mmsize jg .hloop %endif ; HIGH_BIT_DEPTH .skip: mov r6, dst_gap sub r0, src_gap sub r1, r6 sub r2, r6 sub r3, r6 sub r4, r6 dec dword r8m jg .vloop ADD rsp, 2*gprsize emms RET %endmacro ; FRAME_INIT_LOWRES INIT_MMX mmx2 FRAME_INIT_LOWRES %if ARCH_X86_64 == 0 INIT_MMX cache32, mmx2 FRAME_INIT_LOWRES %endif INIT_XMM sse2 FRAME_INIT_LOWRES INIT_XMM ssse3 FRAME_INIT_LOWRES INIT_XMM avx FRAME_INIT_LOWRES INIT_XMM xop FRAME_INIT_LOWRES %if HIGH_BIT_DEPTH==0 INIT_YMM avx2 FRAME_INIT_LOWRES %endif ;----------------------------------------------------------------------------- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int 
len ) ;----------------------------------------------------------------------------- %macro MBTREE 0 cglobal mbtree_propagate_cost, 6,6,7 movss m6, [r5] mov r5d, r6m lea r0, [r0+r5*2] add r5d, r5d add r1, r5 add r2, r5 add r3, r5 add r4, r5 neg r5 pxor m4, m4 shufps m6, m6, 0 mova m5, [pw_3fff] .loop: movq m2, [r2+r5] ; intra movq m0, [r4+r5] ; invq movq m3, [r3+r5] ; inter movq m1, [r1+r5] ; prop pand m3, m5 pminsw m3, m2 punpcklwd m2, m4 punpcklwd m0, m4 pmaddwd m0, m2 punpcklwd m1, m4 punpcklwd m3, m4 %if cpuflag(fma4) cvtdq2ps m0, m0 cvtdq2ps m1, m1 fmaddps m0, m0, m6, m1 cvtdq2ps m1, m2 psubd m2, m3 cvtdq2ps m2, m2 rcpps m3, m1 mulps m1, m3 mulps m0, m2 addps m2, m3, m3 fnmaddps m3, m1, m3, m2 mulps m0, m3 %else cvtdq2ps m0, m0 mulps m0, m6 ; intra*invq*fps_factor>>8 cvtdq2ps m1, m1 ; prop addps m0, m1 ; prop + (intra*invq*fps_factor>>8) cvtdq2ps m1, m2 ; intra psubd m2, m3 ; intra - inter cvtdq2ps m2, m2 ; intra - inter rcpps m3, m1 ; 1 / intra 1st approximation mulps m1, m3 ; intra * (1/intra 1st approx) mulps m1, m3 ; intra * (1/intra 1st approx)^2 mulps m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) addps m3, m3 ; 2 * (1/intra 1st approx) subps m3, m1 ; 2nd approximation for 1/intra mulps m0, m3 ; / intra %endif cvtps2dq m0, m0 packssdw m0, m0 movh [r0+r5], m0 add r5, 8 jl .loop RET %endmacro INIT_XMM sse2 MBTREE ; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower. INIT_XMM fma4 MBTREE %macro INT16_UNPACK 1 punpckhwd xm6, xm%1, xm7 punpcklwd xm%1, xm7 vinsertf128 m%1, m%1, xm6, 1 %endmacro ; FIXME: align loads to 16 bytes %macro MBTREE_AVX 0 cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) vbroadcastss m5, [r5] mov r5d, r6m lea r2, [r2+r5*2] add r5d, r5d add r4, r5 neg r5 sub r1, r5 sub r3, r5 sub r0, r5 mova xm4, [pw_3fff] %if notcpuflag(avx2) pxor xm7, xm7 %endif .loop: %if cpuflag(avx2) pmovzxwd m0, [r2+r5] ; intra pmovzxwd m1, [r4+r5] ; invq pmovzxwd m2, [r1+r5] ; prop pand xm3, xm4, [r3+r5] ; inter pmovzxwd m3, xm3 pmaddwd m1, m0 psubusw m3, m0, m3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 cvtdq2ps m3, m3 fmaddps m1, m1, m5, m2 rcpps m2, m0 mulps m0, m2 mulps m1, m3 addps m3, m2, m2 fnmaddps m2, m2, m0, m3 mulps m1, m2 %else movu xm0, [r2+r5] movu xm1, [r4+r5] movu xm2, [r1+r5] pand xm3, xm4, [r3+r5] psubusw xm3, xm0, xm3 INT16_UNPACK 0 INT16_UNPACK 1 INT16_UNPACK 2 INT16_UNPACK 3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 cvtdq2ps m3, m3 mulps m1, m0 mulps m1, m5 ; intra*invq*fps_factor>>8 addps m1, m2 ; prop + (intra*invq*fps_factor>>8) rcpps m2, m0 ; 1 / intra 1st approximation mulps m0, m2 ; intra * (1/intra 1st approx) mulps m0, m2 ; intra * (1/intra 1st approx)^2 mulps m1, m3 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) addps m2, m2 ; 2 * (1/intra 1st approx) subps m2, m0 ; 2nd approximation for 1/intra mulps m1, m2 ; / intra %endif cvtps2dq m1, m1 vextractf128 xm2, m1, 1 packssdw xm1, xm2 mova [r0+r5], xm1 add r5, 16 jl .loop RET %endmacro INIT_YMM avx MBTREE_AVX INIT_YMM avx2 MBTREE_AVX INIT_ZMM avx512 cglobal mbtree_propagate_cost, 6,6 vbroadcastss m5, [r5] mov r5d, 0x3fff3fff vpbroadcastd ym4, r5d mov r5d, r6m lea r2, [r2+r5*2] add r5d, r5d add r1, r5 neg r5 sub r4, r5 sub r3, r5 sub r0, r5 .loop: pmovzxwd m0, [r2+r5] ; intra pmovzxwd m1, [r1+r5] ; prop pmovzxwd m2, [r4+r5] ; invq pand ym3, ym4, [r3+r5] ; inter pmovzxwd m3, ym3 psubusw m3, m0, m3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 cvtdq2ps m3, m3 vdivps m1, m0, {rn-sae} fmaddps m1, m2, m5, m1 mulps m1, m3 cvtps2dq 
m1, m1 vpmovsdw [r0+r5], m1 add r5, 32 jl .loop RET %macro MBTREE_PROPAGATE_LIST 0 ;----------------------------------------------------------------------------- ; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, ; int16_t *output, int bipred_weight, int mb_y, int len ) ;----------------------------------------------------------------------------- cglobal mbtree_propagate_list_internal, 4,6,8 movh m6, [pw_0to15] ; mb_x movd m7, r5m pshuflw m7, m7, 0 punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y movd m7, r4m SPLATW m7, m7 ; bipred_weight psllw m7, 9 ; bipred_weight << 9 mov r5d, r6m xor r4d, r4d .loop: mova m3, [r1+r4*2] movu m4, [r2+r4*2] mova m5, [pw_0xc000] pand m4, m5 pcmpeqw m4, m5 pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 %if cpuflag(avx) pblendvb m5, m3, m5, m4 %else pand m5, m4 pandn m4, m3 por m5, m4 ; if( lists_used == 3 ) ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 %endif movu m0, [r0+r4*4] ; x,y movu m1, [r0+r4*4+mmsize] psraw m2, m0, 5 psraw m3, m1, 5 mova m4, [pd_4] paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y} paddw m6, m4 ; {mbx, mby} += {4, 0} paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y} paddw m6, m4 ; {mbx, mby} += {4, 0} mova [r3+mmsize*0], m2 mova [r3+mmsize*1], m3 mova m3, [pw_31] pand m0, m3 ; x &= 31 pand m1, m3 ; y &= 31 packuswb m0, m1 psrlw m1, m0, 3 pand m0, m3 ; x SWAP 1, 3 pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw mova m3, [pw_32] psubw m3, m0 ; 32 - x mova m4, [pw_1024] psubw m4, m1 ; (32 - y) << 5 pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5 pmullw m4, m0 ; idx1weight = (32-y)*x << 5 pmullw m0, m1 ; idx3weight = y*x << 5 pmullw m1, m3 ; idx2weight = y*(32-x) << 5 ; avoid overflow in the input to pmulhrsw psrlw m3, m2, 15 psubw m2, m3 ; idx0weight -= (idx0weight == 32768) pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10 pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10 pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10 pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10 SBUTTERFLY wd, 2, 4, 3 SBUTTERFLY wd, 1, 0, 3 mova [r3+mmsize*2], m2 mova [r3+mmsize*3], m4 mova [r3+mmsize*4], m1 mova [r3+mmsize*5], m0 add r4d, mmsize/2 add r3, mmsize*6 cmp r4d, r5d jl .loop REP_RET %endmacro INIT_XMM ssse3 MBTREE_PROPAGATE_LIST INIT_XMM avx MBTREE_PROPAGATE_LIST INIT_YMM avx2 cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8 mova xm4, [pw_0xc000] %if UNIX64 shl r4d, 9 shl r5d, 16 movd xm5, r4d movd xm6, r5d vpbroadcastw xm5, xm5 vpbroadcastd m6, xm6 %else vpbroadcastw xm5, r4m vpbroadcastd m6, r5m psllw xm5, 9 ; bipred_weight << 9 pslld m6, 16 %endif mov r4d, r6m lea r1, [r1+r4*2] lea r2, [r2+r4*2] lea r0, [r0+r4*4] neg r4 por m6, [pd_0123] ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y vbroadcasti128 m7, [pw_31] .loop: mova xm3, [r1+r4*2] pand xm0, xm4, [r2+r4*2] pmulhrsw xm1, xm3, xm5 ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6 pcmpeqw xm0, xm4 pblendvb xm3, xm3, xm1, xm0 ; (lists_used == 3) ? 
bipred_amount : propagate_amount vpermq m3, m3, q1100 movu m0, [r0+r4*4] ; {x, y} vbroadcasti128 m1, [pd_8] psraw m2, m0, 5 paddw m2, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y} paddw m6, m1 ; i_mb_x += 8 mova [r3], m2 mova m1, [pw_32] pand m0, m7 psubw m1, m0 packuswb m1, m0 ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y} psrlw m0, m1, 3 pand m1, [pw_00ff] ; 32-x x 32-x x pandn m0, m7, m0 ; (32-y y 32-y y) << 5 pshufd m2, m1, q1032 pmullw m1, m0 ; idx0 idx3 idx0 idx3 pmullw m2, m0 ; idx1 idx2 idx1 idx2 pmulhrsw m0, m1, m3 ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10 pmulhrsw m2, m3 ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10 psignw m0, m1 ; correct potential overflow in the idx0 input to pmulhrsw punpcklwd m1, m0, m2 ; idx01weight punpckhwd m2, m0 ; idx23weight mova [r3+32], m1 mova [r3+64], m2 add r3, 3*mmsize add r4, 8 jl .loop RET %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, ; uint16_t *lowres_costs, int bipred_weight, int mb_y, ; int width, int height, int stride, int list_mask ); ;----------------------------------------------------------------------------- INIT_ZMM avx512 cglobal mbtree_propagate_list_internal, 5,7,21 mova xm16, [pw_0xc000] vpbroadcastw xm17, r5m ; bipred_weight << 9 vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT) vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf] vbroadcasti32x8 m6, [pd_0123] vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y vbroadcasti128 m7, [pd_8] vbroadcasti128 m8, [pw_31] vbroadcasti128 m9, [pw_32] psllw m10, m9, 4 pcmpeqw ym19, ym19 ; pw_m1 vpbroadcastw ym20, r7m ; width psrld m11, m7, 3 ; pd_1 psrld m12, m8, 16 ; pd_31 vpbroadcastd m13, r8m ; height vpbroadcastd m14, r9m ; stride pslld m15, m14, 16 por m15, m11 ; {1, stride, 1, stride} ... lea r4, [r4+2*r0] ; lowres_costs lea r3, [r3+2*r0] ; propagate_amount lea r2, [r2+4*r0] ; mvs neg r0 mov r6d, 0x5555ffff kmovd k4, r6d kshiftrd k5, k4, 16 ; 0x5555 kshiftlw k6, k4, 8 ; 0xff00 .loop: vbroadcasti128 ym1, [r4+2*r0] mova xm4, [r3+2*r0] vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3) vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 vptestmw k1, ym1, ym18 vpermw m4, m5, m4 vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy} psraw m0, m3, 5 paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y} paddd m6, m7 ; i_mb_x += 8 pand m3, m8 ; {x, y} vprold m1, m3, 20 ; {y, x} << 4 vpsubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y} vpsubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4 pmullw m3, m1 paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000) pmulhrsw m2, m3, m4 ; idx01weight idx23weightp pslld ym1, ym0, 16 psubw ym1, ym19 vmovdqu16 ym1 {k5}, ym0 vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width kunpckwd k2, k2, k2 psrad m1, m0, 16 vpaddd m1 {k6}, m11 vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height pmaddwd m0, m15 vpaddd m0 {k6}, m14 ; idx0 | idx2 vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes ; We're handling dwords, but the offsets are in words so there may be partial overlaps. ; We can work around this by handling dword-aligned and -unaligned offsets separately. 
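; Conceptually this whole block is just "ref_costs[offset[i]] += weight[i]" for 16 dword
; lanes at a time (each lane updates two adjacent 16-bit costs): gather the current costs,
; add the propagated weights, scatter back. vpconflictd flags lanes whose offsets collide so
; their weights can be summed before the scatter; a plain scatter would let only one of the
; colliding lanes survive.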
vptestmd k0, m0, m11 kandnw k2, k0, k1 ; dword-aligned offsets kmovw k3, k2 vpgatherdd m3 {k2}, [r1+2*m0] ; If there are conflicts in the offsets we have to handle them before storing the results. ; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel ; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets. vpconflictd m4, m0 vpbroadcastmw2d m1, k1 vptestmd k2, m1, m4 ktestw k2, k2 jz .no_conflicts pand m1, m4 ; mask away unused offsets to avoid false positives vplzcntd m1, m1 pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb .conflict_loop: vpermd m4 {k2}{z}, m1, m2 vpermd m1 {k2}, m1, m1 ; shift the index one step forward paddsw m2, m4 ; add the weights of conflicting offsets vpcmpd k2, m1, m12, 2 ktestw k2, k2 jnz .conflict_loop .no_conflicts: paddsw m3, m2 vpscatterdd [r1+2*m0] {k3}, m3 kandw k1, k0, k1 ; dword-unaligned offsets kmovw k2, k1 vpgatherdd m1 {k1}, [r1+2*m0] paddsw m1, m2 ; all conflicts have already been resolved vpscatterdd [r1+2*m0] {k2}, m1 add r0, 8 jl .loop RET %endif %macro MBTREE_FIX8 0 ;----------------------------------------------------------------------------- ; void mbtree_fix8_pack( uint16_t *dst, float *src, int count ) ;----------------------------------------------------------------------------- cglobal mbtree_fix8_pack, 3,4 %if mmsize == 32 vbroadcastf128 m2, [pf_256] vbroadcasti128 m3, [mbtree_fix8_pack_shuf] %else movaps m2, [pf_256] mova m3, [mbtree_fix8_pack_shuf] %endif sub r2d, mmsize/2 movsxdifnidn r2, r2d lea r1, [r1+4*r2] lea r0, [r0+2*r2] neg r2 jg .skip_loop .loop: mulps m0, m2, [r1+4*r2] mulps m1, m2, [r1+4*r2+mmsize] cvttps2dq m0, m0 cvttps2dq m1, m1 packssdw m0, m1 pshufb m0, m3 %if mmsize == 32 vpermq m0, m0, q3120 %endif mova [r0+2*r2], m0 add r2, mmsize/2 jle .loop .skip_loop: sub r2, mmsize/2 jz .end ; Do the remaining values in scalar in order to avoid overreading src. .scalar: mulss xm0, xm2, [r1+4*r2+2*mmsize] cvttss2si r3d, xm0 rol r3w, 8 mov [r0+2*r2+mmsize], r3w inc r2 jl .scalar .end: RET ;----------------------------------------------------------------------------- ; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count ) ;----------------------------------------------------------------------------- cglobal mbtree_fix8_unpack, 3,4 %if mmsize == 32 vbroadcastf128 m2, [pf_inv16777216] %else movaps m2, [pf_inv16777216] mova m4, [mbtree_fix8_unpack_shuf+16] %endif mova m3, [mbtree_fix8_unpack_shuf] sub r2d, mmsize/2 movsxdifnidn r2, r2d lea r1, [r1+2*r2] lea r0, [r0+4*r2] neg r2 jg .skip_loop .loop: %if mmsize == 32 vbroadcasti128 m0, [r1+2*r2] vbroadcasti128 m1, [r1+2*r2+16] pshufb m0, m3 pshufb m1, m3 %else mova m1, [r1+2*r2] pshufb m0, m1, m3 pshufb m1, m4 %endif cvtdq2ps m0, m0 cvtdq2ps m1, m1 mulps m0, m2 mulps m1, m2 movaps [r0+4*r2], m0 movaps [r0+4*r2+mmsize], m1 add r2, mmsize/2 jle .loop .skip_loop: sub r2, mmsize/2 jz .end .scalar: movzx r3d, word [r1+2*r2+mmsize] bswap r3d ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design. cvtsi2ss xm0, xm2, r3d mulss xm0, xm2 movss [r0+4*r2+2*mmsize], xm0 inc r2 jl .scalar .end: RET %endmacro INIT_XMM ssse3 MBTREE_FIX8 INIT_YMM avx2 MBTREE_FIX8 %macro MBTREE_FIX8_AVX512_END 0 add r2, mmsize/2 jle .loop cmp r2d, mmsize/2 jl .tail RET .tail: ; Do the final loop iteration with partial masking to handle the remaining elements. 
shrx r3d, r3d, r2d ; (1 << count) - 1 kmovd k1, r3d kshiftrd k2, k1, 16 jmp .loop %endmacro INIT_ZMM avx512 cglobal mbtree_fix8_pack, 3,4 vbroadcastf32x4 m2, [pf_256] vbroadcasti32x4 m3, [mbtree_fix8_pack_shuf] psrld xm4, xm3, 4 pmovzxbq m4, xm4 sub r2d, mmsize/2 mov r3d, -1 movsxdifnidn r2, r2d lea r1, [r1+4*r2] lea r0, [r0+2*r2] neg r2 jg .tail kmovd k1, r3d kmovw k2, k1 .loop: vmulps m0 {k1}{z}, m2, [r1+4*r2] vmulps m1 {k2}{z}, m2, [r1+4*r2+mmsize] cvttps2dq m0, m0 cvttps2dq m1, m1 packssdw m0, m1 pshufb m0, m3 vpermq m0, m4, m0 vmovdqu16 [r0+2*r2] {k1}, m0 MBTREE_FIX8_AVX512_END cglobal mbtree_fix8_unpack, 3,4 vbroadcasti32x8 m3, [mbtree_fix8_unpack_shuf] vbroadcastf32x4 m2, [pf_inv16777216] sub r2d, mmsize/2 mov r3d, -1 movsxdifnidn r2, r2d lea r1, [r1+2*r2] lea r0, [r0+4*r2] neg r2 jg .tail kmovw k1, r3d kmovw k2, k1 .loop: mova m1, [r1+2*r2] vshufi32x4 m0, m1, m1, q1100 vshufi32x4 m1, m1, m1, q3322 pshufb m0, m3 pshufb m1, m3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 mulps m0, m2 mulps m1, m2 vmovaps [r0+4*r2] {k1}, m0 vmovaps [r0+4*r2+mmsize] {k2}, m1 MBTREE_FIX8_AVX512_END x264-master/common/x86/mc-c.c000066400000000000000000001533551502133446700157730ustar00rootroot00000000000000/***************************************************************************** * mc-c.c: x86 motion compensation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "mc.h" #define x264_pixel_avg_16x16_avx2 x264_template(pixel_avg_16x16_avx2) #define x264_pixel_avg_16x16_avx512 x264_template(pixel_avg_16x16_avx512) #define x264_pixel_avg_16x16_mmx2 x264_template(pixel_avg_16x16_mmx2) #define x264_pixel_avg_16x16_sse2 x264_template(pixel_avg_16x16_sse2) #define x264_pixel_avg_16x16_ssse3 x264_template(pixel_avg_16x16_ssse3) #define x264_pixel_avg_16x8_avx2 x264_template(pixel_avg_16x8_avx2) #define x264_pixel_avg_16x8_avx512 x264_template(pixel_avg_16x8_avx512) #define x264_pixel_avg_16x8_mmx2 x264_template(pixel_avg_16x8_mmx2) #define x264_pixel_avg_16x8_sse2 x264_template(pixel_avg_16x8_sse2) #define x264_pixel_avg_16x8_ssse3 x264_template(pixel_avg_16x8_ssse3) #define x264_pixel_avg_4x16_mmx2 x264_template(pixel_avg_4x16_mmx2) #define x264_pixel_avg_4x16_sse2 x264_template(pixel_avg_4x16_sse2) #define x264_pixel_avg_4x16_ssse3 x264_template(pixel_avg_4x16_ssse3) #define x264_pixel_avg_4x2_mmx2 x264_template(pixel_avg_4x2_mmx2) #define x264_pixel_avg_4x2_sse2 x264_template(pixel_avg_4x2_sse2) #define x264_pixel_avg_4x2_ssse3 x264_template(pixel_avg_4x2_ssse3) #define x264_pixel_avg_4x4_mmx2 x264_template(pixel_avg_4x4_mmx2) #define x264_pixel_avg_4x4_sse2 x264_template(pixel_avg_4x4_sse2) #define x264_pixel_avg_4x4_ssse3 x264_template(pixel_avg_4x4_ssse3) #define x264_pixel_avg_4x8_mmx2 x264_template(pixel_avg_4x8_mmx2) #define x264_pixel_avg_4x8_sse2 x264_template(pixel_avg_4x8_sse2) #define x264_pixel_avg_4x8_ssse3 x264_template(pixel_avg_4x8_ssse3) #define x264_pixel_avg_8x16_avx512 x264_template(pixel_avg_8x16_avx512) #define x264_pixel_avg_8x16_mmx2 x264_template(pixel_avg_8x16_mmx2) #define x264_pixel_avg_8x16_sse2 x264_template(pixel_avg_8x16_sse2) #define x264_pixel_avg_8x16_ssse3 x264_template(pixel_avg_8x16_ssse3) #define x264_pixel_avg_8x4_avx512 x264_template(pixel_avg_8x4_avx512) #define x264_pixel_avg_8x4_mmx2 x264_template(pixel_avg_8x4_mmx2) #define x264_pixel_avg_8x4_sse2 x264_template(pixel_avg_8x4_sse2) #define x264_pixel_avg_8x4_ssse3 x264_template(pixel_avg_8x4_ssse3) #define x264_pixel_avg_8x8_avx512 x264_template(pixel_avg_8x8_avx512) #define x264_pixel_avg_8x8_mmx2 x264_template(pixel_avg_8x8_mmx2) #define x264_pixel_avg_8x8_sse2 x264_template(pixel_avg_8x8_sse2) #define x264_pixel_avg_8x8_ssse3 x264_template(pixel_avg_8x8_ssse3) #define DECL_SUF( func, args )\ void func##_mmx2 args;\ void func##_sse2 args;\ void func##_ssse3 args;\ void func##_avx2 args;\ void func##_avx512 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_8x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_8x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_8x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_4x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_4x2, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) #undef DECL_SUF #define x264_mc_weight_w12_mmx2 x264_template(mc_weight_w12_mmx2) 
#define x264_mc_weight_w12_sse2 x264_template(mc_weight_w12_sse2) #define x264_mc_weight_w16_avx2 x264_template(mc_weight_w16_avx2) #define x264_mc_weight_w16_mmx2 x264_template(mc_weight_w16_mmx2) #define x264_mc_weight_w16_sse2 x264_template(mc_weight_w16_sse2) #define x264_mc_weight_w16_ssse3 x264_template(mc_weight_w16_ssse3) #define x264_mc_weight_w20_avx2 x264_template(mc_weight_w20_avx2) #define x264_mc_weight_w20_mmx2 x264_template(mc_weight_w20_mmx2) #define x264_mc_weight_w20_sse2 x264_template(mc_weight_w20_sse2) #define x264_mc_weight_w20_ssse3 x264_template(mc_weight_w20_ssse3) #define x264_mc_weight_w4_mmx2 x264_template(mc_weight_w4_mmx2) #define x264_mc_weight_w4_ssse3 x264_template(mc_weight_w4_ssse3) #define x264_mc_weight_w8_avx2 x264_template(mc_weight_w8_avx2) #define x264_mc_weight_w8_mmx2 x264_template(mc_weight_w8_mmx2) #define x264_mc_weight_w8_sse2 x264_template(mc_weight_w8_sse2) #define x264_mc_weight_w8_ssse3 x264_template(mc_weight_w8_ssse3) #define MC_WEIGHT(w,type) \ void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define x264_mc_offsetadd_w12_mmx2 x264_template(mc_offsetadd_w12_mmx2) #define x264_mc_offsetadd_w16_mmx2 x264_template(mc_offsetadd_w16_mmx2) #define x264_mc_offsetadd_w16_sse2 x264_template(mc_offsetadd_w16_sse2) #define x264_mc_offsetadd_w20_mmx2 x264_template(mc_offsetadd_w20_mmx2) #define x264_mc_offsetadd_w20_sse2 x264_template(mc_offsetadd_w20_sse2) #define x264_mc_offsetadd_w4_mmx2 x264_template(mc_offsetadd_w4_mmx2) #define x264_mc_offsetadd_w8_mmx2 x264_template(mc_offsetadd_w8_mmx2) #define x264_mc_offsetadd_w8_sse2 x264_template(mc_offsetadd_w8_sse2) #define x264_mc_offsetsub_w12_mmx2 x264_template(mc_offsetsub_w12_mmx2) #define x264_mc_offsetsub_w16_mmx2 x264_template(mc_offsetsub_w16_mmx2) #define x264_mc_offsetsub_w16_sse2 x264_template(mc_offsetsub_w16_sse2) #define x264_mc_offsetsub_w20_mmx2 x264_template(mc_offsetsub_w20_mmx2) #define x264_mc_offsetsub_w20_sse2 x264_template(mc_offsetsub_w20_sse2) #define x264_mc_offsetsub_w4_mmx2 x264_template(mc_offsetsub_w4_mmx2) #define x264_mc_offsetsub_w8_mmx2 x264_template(mc_offsetsub_w8_mmx2) #define x264_mc_offsetsub_w8_sse2 x264_template(mc_offsetsub_w8_sse2) #define MC_WEIGHT_OFFSET(w,type) \ void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \ void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \ MC_WEIGHT(w,type) MC_WEIGHT_OFFSET( 4, mmx2 ) MC_WEIGHT_OFFSET( 8, mmx2 ) MC_WEIGHT_OFFSET( 12, mmx2 ) MC_WEIGHT_OFFSET( 16, mmx2 ) MC_WEIGHT_OFFSET( 20, mmx2 ) MC_WEIGHT_OFFSET( 12, sse2 ) MC_WEIGHT_OFFSET( 16, sse2 ) MC_WEIGHT_OFFSET( 20, sse2 ) #if HIGH_BIT_DEPTH MC_WEIGHT_OFFSET( 8, sse2 ) #endif MC_WEIGHT( 8, sse2 ) MC_WEIGHT( 4, ssse3 ) MC_WEIGHT( 8, ssse3 ) MC_WEIGHT( 12, ssse3 ) MC_WEIGHT( 16, ssse3 ) MC_WEIGHT( 20, ssse3 ) MC_WEIGHT( 8, avx2 ) MC_WEIGHT( 16, avx2 ) MC_WEIGHT( 20, avx2 ) #undef MC_WEIGHT_OFFSET #undef MC_WEIGHT #define x264_mc_copy_w4_mmx x264_template(mc_copy_w4_mmx) void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w8_mmx x264_template(mc_copy_w8_mmx) void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w8_sse x264_template(mc_copy_w8_sse) void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w16_mmx x264_template(mc_copy_w16_mmx) void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, 
intptr_t, int ); #define x264_mc_copy_w16_sse x264_template(mc_copy_w16_sse) void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w16_aligned_sse x264_template(mc_copy_w16_aligned_sse) void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_mc_copy_w16_avx x264_template(mc_copy_w16_avx) void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); #define x264_mc_copy_w16_aligned_avx x264_template(mc_copy_w16_aligned_avx) void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int ); #define x264_prefetch_fenc_400_mmx2 x264_template(prefetch_fenc_400_mmx2) void x264_prefetch_fenc_400_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_prefetch_fenc_420_mmx2 x264_template(prefetch_fenc_420_mmx2) void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_prefetch_fenc_422_mmx2 x264_template(prefetch_fenc_422_mmx2) void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_prefetch_ref_mmx2 x264_template(prefetch_ref_mmx2) void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); #define x264_plane_copy_core_sse x264_template(plane_copy_core_sse) void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define x264_plane_copy_core_avx x264_template(plane_copy_core_avx) void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define x264_plane_copy_avx512 x264_template(plane_copy_avx512) void x264_plane_copy_avx512( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define x264_plane_copy_swap_core_ssse3 x264_template(plane_copy_swap_core_ssse3) void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define x264_plane_copy_swap_core_avx2 x264_template(plane_copy_swap_core_avx2) void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define x264_plane_copy_swap_avx512 x264_template(plane_copy_swap_avx512) void x264_plane_copy_swap_avx512( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); #define x264_plane_copy_interleave_core_mmx2 x264_template(plane_copy_interleave_core_mmx2) void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_plane_copy_interleave_core_sse2 x264_template(plane_copy_interleave_core_sse2) void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_plane_copy_interleave_core_avx x264_template(plane_copy_interleave_core_avx) void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); #define x264_plane_copy_deinterleave_sse2 x264_template(plane_copy_deinterleave_sse2) void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_ssse3 x264_template(plane_copy_deinterleave_ssse3) void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta, uint8_t *dstb, intptr_t i_dstb, uint8_t *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_avx x264_template(plane_copy_deinterleave_avx) void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta, uint16_t *dstb, intptr_t i_dstb, uint16_t *src, intptr_t i_src, int w, 
int h ); #define x264_plane_copy_deinterleave_avx2 x264_template(plane_copy_deinterleave_avx2) void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_rgb_sse2 x264_template(plane_copy_deinterleave_rgb_sse2) void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); #define x264_plane_copy_deinterleave_rgb_ssse3 x264_template(plane_copy_deinterleave_rgb_ssse3) void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); #define x264_plane_copy_deinterleave_rgb_avx2 x264_template(plane_copy_deinterleave_rgb_avx2) void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); #define x264_plane_copy_deinterleave_v210_ssse3 x264_template(plane_copy_deinterleave_v210_ssse3) void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint32_t *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_v210_avx x264_template(plane_copy_deinterleave_v210_avx) void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint32_t *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_v210_avx2 x264_template(plane_copy_deinterleave_v210_avx2) void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint32_t *src, intptr_t i_src, int w, int h ); #define x264_plane_copy_deinterleave_v210_avx512 x264_template(plane_copy_deinterleave_v210_avx512) void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint32_t *src, intptr_t i_src, int w, int h ); #define x264_store_interleave_chroma_mmx2 x264_template(store_interleave_chroma_mmx2) void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); #define x264_store_interleave_chroma_sse2 x264_template(store_interleave_chroma_sse2) void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); #define x264_store_interleave_chroma_avx x264_template(store_interleave_chroma_avx) void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); #define x264_load_deinterleave_chroma_fenc_sse2 x264_template(load_deinterleave_chroma_fenc_sse2) void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_ssse3 x264_template(load_deinterleave_chroma_fenc_ssse3) void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_avx x264_template(load_deinterleave_chroma_fenc_avx) void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2) void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_avx512 
x264_template(load_deinterleave_chroma_fenc_avx512) void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2) void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3) void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_avx x264_template(load_deinterleave_chroma_fdec_avx) void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_avx2 x264_template(load_deinterleave_chroma_fdec_avx2) void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_avx512 x264_template(load_deinterleave_chroma_fdec_avx512) void x264_load_deinterleave_chroma_fdec_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); #define x264_memcpy_aligned_sse x264_template(memcpy_aligned_sse) void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n ); #define x264_memcpy_aligned_avx x264_template(memcpy_aligned_avx) void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n ); #define x264_memcpy_aligned_avx512 x264_template(memcpy_aligned_avx512) void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n ); #define x264_memzero_aligned_sse x264_template(memzero_aligned_sse) void x264_memzero_aligned_sse ( void *dst, size_t n ); #define x264_memzero_aligned_avx x264_template(memzero_aligned_avx) void x264_memzero_aligned_avx ( void *dst, size_t n ); #define x264_memzero_aligned_avx512 x264_template(memzero_aligned_avx512) void x264_memzero_aligned_avx512( void *dst, size_t n ); #define x264_integral_init4h_sse4 x264_template(integral_init4h_sse4) void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); #define x264_integral_init4h_avx2 x264_template(integral_init4h_avx2) void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); #define x264_integral_init8h_sse4 x264_template(integral_init8h_sse4) void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); #define x264_integral_init8h_avx x264_template(integral_init8h_avx) void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); #define x264_integral_init8h_avx2 x264_template(integral_init8h_avx2) void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); #define x264_integral_init4v_mmx x264_template(integral_init4v_mmx) void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); #define x264_integral_init4v_sse2 x264_template(integral_init4v_sse2) void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); #define x264_integral_init4v_ssse3 x264_template(integral_init4v_ssse3) void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); #define x264_integral_init4v_avx2 x264_template(integral_init4v_avx2) void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); #define x264_integral_init8v_mmx x264_template(integral_init8v_mmx) void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); #define x264_integral_init8v_sse2 x264_template(integral_init8v_sse2) void 
x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); #define x264_integral_init8v_avx2 x264_template(integral_init8v_avx2) void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); #define x264_mbtree_propagate_cost_sse2 x264_template(mbtree_propagate_cost_sse2) void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define x264_mbtree_propagate_cost_avx x264_template(mbtree_propagate_cost_avx) void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define x264_mbtree_propagate_cost_fma4 x264_template(mbtree_propagate_cost_fma4) void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define x264_mbtree_propagate_cost_avx2 x264_template(mbtree_propagate_cost_avx2) void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define x264_mbtree_propagate_cost_avx512 x264_template(mbtree_propagate_cost_avx512) void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define x264_mbtree_fix8_pack_ssse3 x264_template(mbtree_fix8_pack_ssse3) void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count ); #define x264_mbtree_fix8_pack_avx2 x264_template(mbtree_fix8_pack_avx2) void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count ); #define x264_mbtree_fix8_pack_avx512 x264_template(mbtree_fix8_pack_avx512) void x264_mbtree_fix8_pack_avx512( uint16_t *dst, float *src, int count ); #define x264_mbtree_fix8_unpack_ssse3 x264_template(mbtree_fix8_unpack_ssse3) void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count ); #define x264_mbtree_fix8_unpack_avx2 x264_template(mbtree_fix8_unpack_avx2) void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count ); #define x264_mbtree_fix8_unpack_avx512 x264_template(mbtree_fix8_unpack_avx512) void x264_mbtree_fix8_unpack_avx512( float *dst, uint16_t *src, int count ); #define x264_mc_chroma_avx x264_template(mc_chroma_avx) #define x264_mc_chroma_avx2 x264_template(mc_chroma_avx2) #define x264_mc_chroma_cache64_ssse3 x264_template(mc_chroma_cache64_ssse3) #define x264_mc_chroma_mmx2 x264_template(mc_chroma_mmx2) #define x264_mc_chroma_sse2 x264_template(mc_chroma_sse2) #define x264_mc_chroma_ssse3 x264_template(mc_chroma_ssse3) #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ int dx, int dy, int i_width, int i_height ); MC_CHROMA(mmx2) MC_CHROMA(sse2) MC_CHROMA(ssse3) MC_CHROMA(cache64_ssse3) MC_CHROMA(avx) MC_CHROMA(avx2) #undef MC_CHROMA #define x264_frame_init_lowres_core_avx x264_template(frame_init_lowres_core_avx) #define x264_frame_init_lowres_core_avx2 x264_template(frame_init_lowres_core_avx2) #define x264_frame_init_lowres_core_mmx2 x264_template(frame_init_lowres_core_mmx2) #define x264_frame_init_lowres_core_cache32_mmx2 x264_template(frame_init_lowres_core_cache32_mmx2) #define x264_frame_init_lowres_core_sse2 x264_template(frame_init_lowres_core_sse2) #define x264_frame_init_lowres_core_ssse3 
x264_template(frame_init_lowres_core_ssse3) #define x264_frame_init_lowres_core_xop x264_template(frame_init_lowres_core_xop) #define LOWRES(cpu)\ void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\ intptr_t src_stride, intptr_t dst_stride, int width, int height ); LOWRES(mmx2) LOWRES(cache32_mmx2) LOWRES(sse2) LOWRES(ssse3) LOWRES(avx) LOWRES(xop) LOWRES(avx2) #undef LOWRES #define x264_pixel_avg2_w10_mmx2 x264_template(pixel_avg2_w10_mmx2) #define x264_pixel_avg2_w10_sse2 x264_template(pixel_avg2_w10_sse2) #define x264_pixel_avg2_w12_cache32_mmx2 x264_template(pixel_avg2_w12_cache32_mmx2) #define x264_pixel_avg2_w12_cache64_mmx2 x264_template(pixel_avg2_w12_cache64_mmx2) #define x264_pixel_avg2_w12_mmx2 x264_template(pixel_avg2_w12_mmx2) #define x264_pixel_avg2_w16_avx2 x264_template(pixel_avg2_w16_avx2) #define x264_pixel_avg2_w16_cache32_mmx2 x264_template(pixel_avg2_w16_cache32_mmx2) #define x264_pixel_avg2_w16_cache64_mmx2 x264_template(pixel_avg2_w16_cache64_mmx2) #define x264_pixel_avg2_w16_cache64_sse2 x264_template(pixel_avg2_w16_cache64_sse2) #define x264_pixel_avg2_w16_cache64_ssse3 x264_template(pixel_avg2_w16_cache64_ssse3) #define x264_pixel_avg2_w16_mmx2 x264_template(pixel_avg2_w16_mmx2) #define x264_pixel_avg2_w16_sse2 x264_template(pixel_avg2_w16_sse2) #define x264_pixel_avg2_w18_avx2 x264_template(pixel_avg2_w18_avx2) #define x264_pixel_avg2_w18_mmx2 x264_template(pixel_avg2_w18_mmx2) #define x264_pixel_avg2_w18_sse2 x264_template(pixel_avg2_w18_sse2) #define x264_pixel_avg2_w20_avx2 x264_template(pixel_avg2_w20_avx2) #define x264_pixel_avg2_w20_cache32_mmx2 x264_template(pixel_avg2_w20_cache32_mmx2) #define x264_pixel_avg2_w20_cache64_mmx2 x264_template(pixel_avg2_w20_cache64_mmx2) #define x264_pixel_avg2_w20_cache64_sse2 x264_template(pixel_avg2_w20_cache64_sse2) #define x264_pixel_avg2_w20_mmx2 x264_template(pixel_avg2_w20_mmx2) #define x264_pixel_avg2_w20_sse2 x264_template(pixel_avg2_w20_sse2) #define x264_pixel_avg2_w4_mmx2 x264_template(pixel_avg2_w4_mmx2) #define x264_pixel_avg2_w8_cache32_mmx2 x264_template(pixel_avg2_w8_cache32_mmx2) #define x264_pixel_avg2_w8_cache64_mmx2 x264_template(pixel_avg2_w8_cache64_mmx2) #define x264_pixel_avg2_w8_mmx2 x264_template(pixel_avg2_w8_mmx2) #define x264_pixel_avg2_w8_sse2 x264_template(pixel_avg2_w8_sse2) #define PIXEL_AVG_W(width,cpu)\ void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ); /* This declares some functions that don't exist, but that isn't a problem. 
*/ #define PIXEL_AVG_WALL(cpu)\ PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu); PIXEL_AVG_WALL(mmx2) PIXEL_AVG_WALL(cache32_mmx2) PIXEL_AVG_WALL(cache64_mmx2) PIXEL_AVG_WALL(cache64_sse2) PIXEL_AVG_WALL(sse2) PIXEL_AVG_WALL(cache64_ssse3) PIXEL_AVG_WALL(avx2) #undef PIXEL_AVG_W #undef PIXEL_AVG_WALL #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\ {\ NULL,\ x264_pixel_avg2_w4_##name1,\ x264_pixel_avg2_w8_##name2,\ x264_pixel_avg2_w12_##name3,\ x264_pixel_avg2_w16_##name4,\ x264_pixel_avg2_w20_##name5,\ }; #if HIGH_BIT_DEPTH /* we can replace w12/w20 with w10/w18 as only 9/17 pixels in fact are important */ #undef x264_pixel_avg2_w12_mmx2 #undef x264_pixel_avg2_w20_mmx2 #undef x264_pixel_avg2_w20_sse2 #undef x264_pixel_avg2_w20_avx2 #define x264_pixel_avg2_w12_mmx2 x264_pixel_avg2_w10_mmx2 #define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2 #define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2 #define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2 #define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2 #define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2 #else /* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */ #define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3 #define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2 #define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3 #define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2 #endif // HIGH_BIT_DEPTH PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2) #if HIGH_BIT_DEPTH PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2) PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2) #else // !HIGH_BIT_DEPTH #if ARCH_X86 PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2) PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2) #endif PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2) PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2) PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2) PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2) PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2) #endif // HIGH_BIT_DEPTH #define MC_COPY_WTAB(instr, name1, name2, name3)\ static void (* const mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\ {\ NULL,\ x264_mc_copy_w4_##name1,\ x264_mc_copy_w8_##name2,\ NULL,\ x264_mc_copy_w16_##name3,\ }; MC_COPY_WTAB(mmx,mmx,mmx,mmx) #if HIGH_BIT_DEPTH MC_COPY_WTAB(sse,mmx,sse,sse) MC_COPY_WTAB(avx,mmx,sse,avx) #else MC_COPY_WTAB(sse,mmx,mmx,sse) #endif #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\ static void (* mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\ {\ x264_mc_##function##_w4_##name1,\ x264_mc_##function##_w4_##name1,\ x264_mc_##function##_w8_##name2,\ x264_mc_##function##_w##w12version##_##instr,\ x264_mc_##function##_w16_##instr,\ x264_mc_##function##_w20_##instr,\ }; #if HIGH_BIT_DEPTH MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12) MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12) MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12) MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12) MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16) 
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)

static void weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = abs(w->i_offset * (1 << (BIT_DEPTH-8)));
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    int den2 = w->i_scale<<1;
    int den3 = 1+(w->i_offset * (1 << (BIT_DEPTH-8+1)));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
#else
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)

static void weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    int i;
    int16_t den1;

    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = (w->i_offset * (1<<w->i_denom)) | (w->i_denom ? 1 << (w->i_denom - 1) : 0);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = w->i_scale;
        w->cacheb[i] = den1;
    }
}

static void weight_cache_ssse3( x264_t *h, x264_weight_t *w )
{
    int i, den1;
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = w->i_scale << (8 - w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = w->i_offset;
    }
}
#endif // !HIGH_BIT_DEPTH

#define MC_LUMA(name,instr1,instr2)\
static void mc_luma_##name( pixel *dst, intptr_t i_dst_stride,\
                            pixel *src[4], intptr_t i_src_stride,\
                            int mvx, int mvy,\
                            int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        pixel_avg_wtab_##instr1[i_width>>2](\
                dst, i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
    }\
    else if( weight->weightfn )\
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
    else\
        mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
}

MC_LUMA(mmx2,mmx2,mmx)
MC_LUMA(sse2,sse2,sse)
#if HIGH_BIT_DEPTH
MC_LUMA(avx2,avx2,avx)
#else
#if ARCH_X86
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
#endif // !HIGH_BIT_DEPTH

#define GET_REF(name)\
static pixel *get_ref_##name( pixel *dst, intptr_t *i_dst_stride,\
                              pixel *src[4], intptr_t i_src_stride,\
                              int mvx, int mvy,\
                              int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 =
src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ pixel_avg_wtab_##name[i_width>>2](\ dst, *i_dst_stride, src1, i_src_stride,\ src2, i_height );\ if( weight->weightfn )\ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\ return dst;\ }\ else if( weight->weightfn )\ {\ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\ return dst;\ }\ else\ {\ *i_dst_stride = i_src_stride;\ return src1;\ }\ } GET_REF(mmx2) GET_REF(sse2) GET_REF(avx2) #if !HIGH_BIT_DEPTH #if ARCH_X86 GET_REF(cache32_mmx2) GET_REF(cache64_mmx2) #endif GET_REF(cache64_sse2) GET_REF(cache64_ssse3) GET_REF(cache64_ssse3_atom) #endif // !HIGH_BIT_DEPTH #define x264_hpel_filter_avx x264_template(hpel_filter_avx) #define x264_hpel_filter_avx2 x264_template(hpel_filter_avx2) #define x264_hpel_filter_c_mmx2 x264_template(hpel_filter_c_mmx2) #define x264_hpel_filter_c_sse2 x264_template(hpel_filter_c_sse2) #define x264_hpel_filter_c_ssse3 x264_template(hpel_filter_c_ssse3) #define x264_hpel_filter_c_avx x264_template(hpel_filter_c_avx) #define x264_hpel_filter_c_avx2 x264_template(hpel_filter_c_avx2) #define x264_hpel_filter_h_mmx2 x264_template(hpel_filter_h_mmx2) #define x264_hpel_filter_h_sse2 x264_template(hpel_filter_h_sse2) #define x264_hpel_filter_h_ssse3 x264_template(hpel_filter_h_ssse3) #define x264_hpel_filter_h_avx x264_template(hpel_filter_h_avx) #define x264_hpel_filter_h_avx2 x264_template(hpel_filter_h_avx2) #define x264_hpel_filter_sse2 x264_template(hpel_filter_sse2) #define x264_hpel_filter_ssse3 x264_template(hpel_filter_ssse3) #define x264_hpel_filter_v_mmx2 x264_template(hpel_filter_v_mmx2) #define x264_hpel_filter_v_sse2 x264_template(hpel_filter_v_sse2) #define x264_hpel_filter_v_ssse3 x264_template(hpel_filter_v_ssse3) #define x264_hpel_filter_v_avx x264_template(hpel_filter_v_avx) #define x264_hpel_filter_v_avx2 x264_template(hpel_filter_v_avx2) #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\ void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\ void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\ static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\ intptr_t stride, int width, int height, int16_t *buf )\ {\ intptr_t realign = (intptr_t)src & (align-1);\ src -= realign;\ dstv -= realign;\ dstc -= realign;\ dsth -= realign;\ width += realign;\ while( height-- )\ {\ x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\ x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\ x264_hpel_filter_h_##cpuh( dsth, src, width );\ dsth += stride;\ dstv += stride;\ dstc += stride;\ src += stride;\ }\ x264_sfence();\ } HPEL(8, mmx2, mmx2, mmx2, mmx2) #if HIGH_BIT_DEPTH HPEL(16, sse2, sse2, sse2, sse2) #else // !HIGH_BIT_DEPTH HPEL(16, sse2_amd, mmx2, mmx2, sse2) #if ARCH_X86_64 void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf ); #else 
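/* On x86-64 the single-pass assembly hpel_filter implementations declared above are used directly;
 * on x86-32 the HPEL macro instead stitches a full half-pixel filter together from the separate
 * v/c/h passes, rounding src down to an `align`-byte boundary (and adjusting the dst pointers and
 * width to match) so the per-pass kernels can rely on aligned accesses. */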
HPEL(16, sse2, sse2, sse2, sse2) HPEL(16, ssse3, ssse3, ssse3, ssse3) HPEL(16, avx, avx, avx, avx) HPEL(32, avx2, avx2, avx2, avx2) #endif #endif // HIGH_BIT_DEPTH PLANE_COPY(16, sse) PLANE_COPY(32, avx) PLANE_COPY_SWAP(16, ssse3) PLANE_COPY_SWAP(32, avx2) #if HIGH_BIT_DEPTH PLANE_COPY_YUYV(64, sse2) PLANE_COPY_YUYV(64, avx) #else PLANE_COPY_YUYV(32, sse2) PLANE_COPY_YUYV(32, ssse3) #endif PLANE_COPY_YUYV(64, avx2) PLANE_INTERLEAVE(mmx2) PLANE_INTERLEAVE(sse2) #if HIGH_BIT_DEPTH PLANE_INTERLEAVE(avx) #endif #if HAVE_X86_INLINE_ASM #undef MC_CLIP_ADD #define MC_CLIP_ADD(s,x)\ do\ {\ int temp_s = s;\ int temp_x = x;\ asm("movd %0, %%xmm0 \n"\ "movd %1, %%xmm1 \n"\ "paddsw %%xmm1, %%xmm0 \n"\ "movd %%xmm0, %0 \n"\ :"+&r"(temp_s)\ :"r"(temp_x)\ :"xmm0", "xmm1"\ );\ s = temp_s;\ } while( 0 ) #undef MC_CLIP_ADD2 #define MC_CLIP_ADD2(s,x)\ do\ {\ x264_union32_t temp = { .w={ (s)[0], (s)[1] } };\ asm("movd %0, %%xmm0 \n"\ "movd %1, %%xmm1 \n"\ "paddsw %%xmm1, %%xmm0 \n"\ "movd %%xmm0, %0 \n"\ :"+&r"(temp)\ :"m"(M32(x))\ :"xmm0", "xmm1"\ );\ (s)[0] = temp.w[0];\ (s)[1] = temp.w[1];\ } while( 0 ) #endif #define x264_mbtree_propagate_list_internal_ssse3 x264_template(mbtree_propagate_list_internal_ssse3) PROPAGATE_LIST(ssse3) #define x264_mbtree_propagate_list_internal_avx x264_template(mbtree_propagate_list_internal_avx) PROPAGATE_LIST(avx) #define x264_mbtree_propagate_list_internal_avx2 x264_template(mbtree_propagate_list_internal_avx2) PROPAGATE_LIST(avx2) #if ARCH_X86_64 #define x264_mbtree_propagate_list_internal_avx512 x264_template(mbtree_propagate_list_internal_avx512) void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, int bipred_weight, int mb_y, int width, int height, int stride, int list_mask ); static void mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, int bipred_weight, int mb_y, int len, int list ) { x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9, mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride, (1 << LOWRES_COST_SHIFT) << list ); } #endif void x264_mc_init_mmx( uint32_t cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) return; pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; pf->integral_init4v = x264_integral_init4v_mmx; pf->integral_init8v = x264_integral_init8v_mmx; if( !(cpu&X264_CPU_MMX2) ) return; pf->prefetch_fenc_400 = x264_prefetch_fenc_400_mmx2; pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2; pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2; pf->prefetch_ref = x264_prefetch_ref_mmx2; pf->plane_copy_interleave = plane_copy_interleave_mmx2; pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmx2; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmx2; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmx2; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmx2; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_mmx2; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmx2; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmx2; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmx2; pf->mc_luma = mc_luma_mmx2; pf->get_ref = get_ref_mmx2; pf->mc_chroma = x264_mc_chroma_mmx2; pf->hpel_filter = x264_hpel_filter_mmx2; pf->weight 
= mc_weight_wtab_mmx2; pf->weight_cache = weight_cache_mmx2; pf->offsetadd = mc_offsetadd_wtab_mmx2; pf->offsetsub = mc_offsetsub_wtab_mmx2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2; if( cpu&X264_CPU_SSE ) { pf->memcpy_aligned = x264_memcpy_aligned_sse; pf->memzero_aligned = x264_memzero_aligned_sse; pf->plane_copy = plane_copy_sse; } #if HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2; #endif if( !(cpu&X264_CPU_SSE2) ) return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_interleave = plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { pf->get_ref = get_ref_sse2; pf->mc_luma = mc_luma_sse2; pf->hpel_filter = x264_hpel_filter_sse2; } pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; pf->offsetadd = mc_offsetadd_wtab_sse2; pf->offsetsub = mc_offsetsub_wtab_sse2; if( cpu&X264_CPU_SSE2_IS_SLOW ) return; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sse2; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_sse2; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sse2; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sse2; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse; pf->weight = mc_weight_wtab_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->plane_copy_swap = plane_copy_swap_ssse3; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3; pf->mbtree_propagate_list = mbtree_propagate_list_ssse3; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3; if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; if( !(cpu&X264_CPU_AVX) ) return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx; pf->plane_copy_interleave = plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx; pf->store_interleave_chroma = x264_store_interleave_chroma_avx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; if( cpu&X264_CPU_XOP ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; if( cpu&X264_CPU_AVX2 ) { pf->mc_luma = mc_luma_avx2; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2; pf->load_deinterleave_chroma_fenc = 
x264_load_deinterleave_chroma_fenc_avx2; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2; } if( cpu&X264_CPU_AVX512 ) { pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512; } #else // !HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead if( cpu&X264_CPU_CACHELINE_32 ) { pf->mc_luma = mc_luma_cache32_mmx2; pf->get_ref = get_ref_cache32_mmx2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2; } else if( cpu&X264_CPU_CACHELINE_64 ) { pf->mc_luma = mc_luma_cache64_mmx2; pf->get_ref = get_ref_cache64_mmx2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2; } #endif if( !(cpu&X264_CPU_SSE2) ) return; pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_sse2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2; if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) { pf->weight = mc_weight_wtab_sse2; if( !(cpu&X264_CPU_SLOW_ATOM) ) { pf->offsetadd = mc_offsetadd_wtab_sse2; pf->offsetsub = mc_offsetsub_wtab_sse2; } pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2; pf->hpel_filter = x264_hpel_filter_sse2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? pf->plane_copy_interleave = plane_copy_interleave_sse2; pf->mc_luma = mc_luma_sse2; pf->get_ref = get_ref_sse2; if( cpu&X264_CPU_CACHELINE_64 ) { pf->mc_luma = mc_luma_cache64_sse2; pf->get_ref = get_ref_cache64_sse2; } } } if( !(cpu&X264_CPU_SSSE3) ) return; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_ssse3; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3; pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_ssse3; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; pf->plane_copy_swap = plane_copy_swap_ssse3; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3; pf->mbtree_propagate_list = mbtree_propagate_list_ssse3; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3; if( !(cpu&X264_CPU_SLOW_PSHUFB) ) { pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_ssse3; } if( !(cpu&X264_CPU_SLOW_PALIGNR) ) { #if ARCH_X86_64 if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? 
*/ #endif pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; } if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_cache64_ssse3; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; if( cpu&X264_CPU_SLOW_ATOM ) { pf->mc_luma = mc_luma_cache64_ssse3_atom; pf->get_ref = get_ref_cache64_ssse3_atom; } } pf->weight_cache = weight_cache_ssse3; pf->weight = mc_weight_wtab_ssse3; if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; if( !(cpu&X264_CPU_SSE4) ) return; pf->integral_init4h = x264_integral_init4h_sse4; pf->integral_init8h = x264_integral_init8h_sse4; if( !(cpu&X264_CPU_AVX) ) return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx; pf->integral_init8h = x264_integral_init8h_avx; pf->hpel_filter = x264_hpel_filter_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; if( cpu&X264_CPU_XOP ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop; if( cpu&X264_CPU_AVX2 ) { pf->hpel_filter = x264_hpel_filter_avx2; pf->mc_chroma = x264_mc_chroma_avx2; pf->weight = mc_weight_wtab_avx2; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx2; pf->integral_init8v = x264_integral_init8v_avx2; pf->integral_init4v = x264_integral_init4v_avx2; pf->integral_init8h = x264_integral_init8h_avx2; pf->integral_init4h = x264_integral_init4h_avx2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; } if( cpu&X264_CPU_AVX512 ) { pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx512; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512; } #endif // HIGH_BIT_DEPTH if( !(cpu&X264_CPU_AVX) ) return; pf->memcpy_aligned = x264_memcpy_aligned_avx; pf->memzero_aligned = x264_memzero_aligned_avx; pf->plane_copy = plane_copy_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; pf->mbtree_propagate_list = mbtree_propagate_list_avx; if( cpu&X264_CPU_FMA4 ) pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4; if( !(cpu&X264_CPU_AVX2) ) return; pf->plane_copy_swap = plane_copy_swap_avx2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2; pf->get_ref = get_ref_avx2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; pf->mbtree_propagate_list = mbtree_propagate_list_avx2; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2; if( !(cpu&X264_CPU_AVX512) ) return; pf->memcpy_aligned = x264_memcpy_aligned_avx512; pf->memzero_aligned = x264_memzero_aligned_avx512; pf->plane_copy = x264_plane_copy_avx512; pf->plane_copy_swap = x264_plane_copy_swap_avx512; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512; #if ARCH_X86_64 pf->mbtree_propagate_list = mbtree_propagate_list_avx512; #endif pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx512; pf->mbtree_fix8_unpack = 
x264_mbtree_fix8_unpack_avx512; } x264-master/common/x86/mc.h000066400000000000000000000026331502133446700155500ustar00rootroot00000000000000/***************************************************************************** * mc.h: x86 motion compensation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_X86_MC_H #define X264_X86_MC_H #define x264_mc_init_mmx x264_template(mc_init_mmx) void x264_mc_init_mmx( uint32_t cpu, x264_mc_functions_t *pf ); #endif x264-master/common/x86/pixel-32.asm000066400000000000000000000253211502133446700170440ustar00rootroot00000000000000;***************************************************************************** ;* pixel-32.asm: x86_32 pixel metrics ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Loren Merritt ;* Laurent Aimar ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" cextern pw_ppmmppmm cextern pw_pmpmpmpm SECTION .text INIT_MMX mmx2 %if HIGH_BIT_DEPTH == 0 %macro LOAD_DIFF_4x8P 1 ; dx LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1] LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3] LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1] LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3] LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2] movq [spill], m5 LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5] movq m5, [spill] %endmacro %macro SUM4x8_MM 0 movq [spill], m6 movq [spill+8], m7 ABSW2 m0, m1, m0, m1, m6, m7 ABSW2 m2, m3, m2, m3, m6, m7 paddw m0, m2 paddw m1, m3 movq m6, [spill] movq m7, [spill+8] ABSW2 m4, m5, m4, m5, m2, m3 ABSW2 m6, m7, m6, m7, m2, m3 paddw m4, m6 paddw m5, m7 paddw m0, m4 paddw m1, m5 paddw m0, m1 %endmacro ;----------------------------------------------------------------------------- ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal push r0 push r2 sub esp, 0x74 %define args esp+0x74 %define spill esp+0x60 ; +16 %define trans esp+0 ; +96 LOAD_DIFF_4x8P 0 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m1 TRANSPOSE4x4W 4, 5, 6, 7, 1 movq [trans+0x00], m4 movq [trans+0x08], m5 movq [trans+0x10], m6 movq [trans+0x18], m7 movq m1, [spill] TRANSPOSE4x4W 0, 1, 2, 3, 4 movq [trans+0x20], m0 movq [trans+0x28], m1 movq [trans+0x30], m2 movq [trans+0x38], m3 mov r0, [args+4] mov r2, [args] LOAD_DIFF_4x8P 4 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m7 TRANSPOSE4x4W 0, 1, 2, 3, 7 movq [trans+0x40], m0 movq [trans+0x48], m1 movq [trans+0x50], m2 movq [trans+0x58], m3 movq m7, [spill] TRANSPOSE4x4W 4, 5, 6, 7, 1 movq m0, [trans+0x00] movq m1, [trans+0x08] movq m2, [trans+0x10] movq m3, [trans+0x18] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 SUM4x8_MM movq [trans], m0 movq m0, [trans+0x20] movq m1, [trans+0x28] movq m2, [trans+0x30] movq m3, [trans+0x38] movq m4, [trans+0x40] movq m5, [trans+0x48] movq m6, [trans+0x50] movq m7, [trans+0x58] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 SUM4x8_MM pavgw m0, [trans] add esp, 0x7c ret %undef args %undef spill %undef trans %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op pxor %7, %7 pshufw %4, %1, q1032 pshufw %5, %2, q1032 pshufw %6, %3, q1032 paddusw %1, %4 paddusw %2, %5 paddusw %3, %6 punpcklwd %1, %7 punpcklwd %2, %7 punpcklwd %3, %7 pshufw %4, %1, q1032 pshufw %5, %2, q1032 pshufw %6, %3, q1032 %8 %1, %4 %8 %2, %5 %8 %3, %6 %endmacro %macro LOAD_4x8P 1 ; dx pxor m7, m7 movd m6, [r0+%1+7*FENC_STRIDE] movd m0, [r0+%1+0*FENC_STRIDE] movd m1, [r0+%1+1*FENC_STRIDE] movd m2, [r0+%1+2*FENC_STRIDE] movd m3, [r0+%1+3*FENC_STRIDE] movd m4, [r0+%1+4*FENC_STRIDE] movd m5, [r0+%1+5*FENC_STRIDE] punpcklbw m6, m7 punpcklbw m0, m7 punpcklbw m1, m7 movq [spill], m6 punpcklbw m2, m7 punpcklbw m3, m7 movd m6, [r0+%1+6*FENC_STRIDE] punpcklbw m4, m7 punpcklbw m5, m7 punpcklbw m6, m7 movq m7, [spill] %endmacro %macro HSUMSUB2 4 pshufw m4, %1, %3 pshufw m5, %2, %3 pmullw %1, %4 pmullw m5, %4 paddw %1, m4 paddw %2, m5 %endmacro ;----------------------------------------------------------------------------- ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- cglobal intra_sa8d_x3_8x8, 2,3 SUB 
esp, 0x94 %define edge esp+0x70 ; +32 %define spill esp+0x60 ; +16 %define trans esp+0 ; +96 %define sum esp+0 ; +32 pxor m7, m7 movq m0, [r1+7] movq m2, [r1+16] movq m1, m0 movq m3, m2 punpcklbw m0, m7 punpckhbw m1, m7 punpcklbw m2, m7 punpckhbw m3, m7 movq m6, [pw_ppmmppmm] HSUMSUB2 m0, m2, q1032, m6 HSUMSUB2 m1, m3, q1032, m6 movq m6, [pw_pmpmpmpm] HSUMSUB2 m0, m2, q2301, m6 HSUMSUB2 m1, m3, q2301, m6 movq m4, m0 movq m5, m2 paddw m0, m1 paddw m2, m3 psubw m4, m1 psubw m3, m5 movq [edge+0], m0 movq [edge+8], m4 movq [edge+16], m2 movq [edge+24], m3 LOAD_4x8P 0 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m0 TRANSPOSE4x4W 4, 5, 6, 7, 0 movq [trans+0x00], m4 movq [trans+0x08], m5 movq [trans+0x10], m6 movq [trans+0x18], m7 movq m0, [spill] TRANSPOSE4x4W 0, 1, 2, 3, 4 movq [trans+0x20], m0 movq [trans+0x28], m1 movq [trans+0x30], m2 movq [trans+0x38], m3 LOAD_4x8P 4 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill], m7 TRANSPOSE4x4W 0, 1, 2, 3, 7 movq [trans+0x40], m0 movq [trans+0x48], m1 movq [trans+0x50], m2 movq [trans+0x58], m3 movq m7, [spill] TRANSPOSE4x4W 4, 5, 6, 7, 0 movq m0, [trans+0x00] movq m1, [trans+0x08] movq m2, [trans+0x10] movq m3, [trans+0x18] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movq [spill+0], m0 movq [spill+8], m1 ABSW2 m2, m3, m2, m3, m0, m1 ABSW2 m4, m5, m4, m5, m0, m1 paddw m2, m4 paddw m3, m5 ABSW2 m6, m7, m6, m7, m4, m5 movq m0, [spill+0] movq m1, [spill+8] paddw m2, m6 paddw m3, m7 paddw m2, m3 ABSW m1, m1, m4 paddw m2, m1 ; 7x4 sum movq m7, m0 movq m1, [edge+8] ; left bottom psllw m1, 3 psubw m7, m1 ABSW2 m0, m7, m0, m7, m5, m3 paddw m0, m2 paddw m7, m2 movq [sum+0], m0 ; dc movq [sum+8], m7 ; left movq m0, [trans+0x20] movq m1, [trans+0x28] movq m2, [trans+0x30] movq m3, [trans+0x38] movq m4, [trans+0x40] movq m5, [trans+0x48] movq m6, [trans+0x50] movq m7, [trans+0x58] HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7 movd [sum+0x10], m0 movd [sum+0x12], m1 movd [sum+0x14], m2 movd [sum+0x16], m3 movd [sum+0x18], m4 movd [sum+0x1a], m5 movd [sum+0x1c], m6 movd [sum+0x1e], m7 movq [spill], m0 movq [spill+8], m1 ABSW2 m2, m3, m2, m3, m0, m1 ABSW2 m4, m5, m4, m5, m0, m1 paddw m2, m4 paddw m3, m5 paddw m2, m3 movq m0, [spill] movq m1, [spill+8] ABSW2 m6, m7, m6, m7, m4, m5 ABSW m1, m1, m3 paddw m2, m7 paddw m1, m6 paddw m2, m1 ; 7x4 sum movq m1, m0 movq m7, [edge+0] psllw m7, 3 ; left top mov r2, [edge+0] add r2, [edge+16] lea r2, [4*r2+32] and r2, 0xffc0 movd m6, r2 ; dc psubw m1, m7 psubw m0, m6 ABSW2 m0, m1, m0, m1, m5, m6 movq m3, [sum+0] ; dc paddw m0, m2 paddw m1, m2 movq m2, m0 paddw m0, m3 paddw m1, [sum+8] ; h psrlq m2, 16 paddw m2, m3 movq m3, [edge+16] ; top left movq m4, [edge+24] ; top right psllw m3, 3 psllw m4, 3 psubw m3, [sum+16] psubw m4, [sum+24] ABSW2 m3, m4, m3, m4, m5, m6 paddw m2, m3 paddw m2, m4 ; v SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw mov r2, r2m pxor m7, m7 punpckldq m2, m1 pavgw m0, m7 pavgw m2, m7 movd [r2+8], m0 ; dc movq [r2+0], m2 ; v, h ADD esp, 0x94 RET %undef edge %undef spill %undef trans %undef sum ;----------------------------------------------------------------------------- ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- cglobal pixel_ssim_4x4x2_core, 0,5 mov r1, r1m mov r3, r3m mov r4, 4 pxor m0, m0 .loop: mov r0, r0m mov r2, r2m add r0, r4 add r2, r4 pxor m1, m1 pxor m2, m2 pxor m3, m3 pxor m4, m4 %rep 4 movd m5, [r0] movd m6, [r2] punpcklbw m5, m0 punpcklbw m6, m0 paddw m1, 
m5 paddw m2, m6 movq m7, m5 pmaddwd m5, m5 pmaddwd m7, m6 pmaddwd m6, m6 paddd m3, m5 paddd m4, m7 paddd m3, m6 add r0, r1 add r2, r3 %endrep mov r0, r4m lea r0, [r0+r4*4] pshufw m5, m1, q0032 pshufw m6, m2, q0032 paddusw m1, m5 paddusw m2, m6 punpcklwd m1, m2 pshufw m2, m1, q0032 pshufw m5, m3, q0032 pshufw m6, m4, q0032 paddusw m1, m2 paddd m3, m5 paddd m4, m6 punpcklwd m1, m0 punpckldq m3, m4 movq [r0+0], m1 movq [r0+8], m3 sub r4, 4 jge .loop emms RET %endif ; !HIGH_BIT_DEPTH x264-master/common/x86/pixel-a.asm000066400000000000000000004332571502133446700170530ustar00rootroot00000000000000;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz ;* Laurent Aimar ;* Alex Izvorski ;* Fiona Glaser ;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1 db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1 hmul_16p: times 16 db 1 times 8 db 1, -1 hmul_8p: times 8 db 1 times 4 db 1, -1 times 8 db 1 times 4 db 1, -1 mask_ff: times 16 db 0xff times 16 db 0 mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1 mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1 mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1 %if HIGH_BIT_DEPTH ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 %endif %if BIT_DEPTH == 10 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 pf_64: times 4 dd 64.0 pf_128: times 4 dd 128.0 %elif BIT_DEPTH == 9 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 %else ; 8-bit ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 %endif hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 pb_pppm: times 4 db 1,1,1,-1 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0 intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6 intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4 intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0 intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11 intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8 intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9 intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3 intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1 intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1 intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0 pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007 pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007 intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15 intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6 intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4 intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0 intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11 intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8 intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9 intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3 intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1 intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1 intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1 intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1 intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0 ALIGN 32 intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5 intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4 intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1 intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0 intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10 intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11 intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14 intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15 intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 
2, 3, 4, 5, 6, 7, 8 intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9 intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10 intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11 intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13 intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12 intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9 intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8 intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14 intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14 intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12 intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12 intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10 intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8 intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10 intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8 intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2 intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0 intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15 intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15 pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003 pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 sw_f0: dq 0xfff0, 0 pd_f0: times 4 dd 0xffff0000 pd_2: times 4 dd 2 pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7 ads_mvs_shuffle: %macro ADS_MVS_SHUFFLE 8 %assign y x %rep 8 %rep 7 %rotate (~y)&1 %assign y y>>((~y)&1) %endrep db %1*2, %1*2+1 %rotate 1 %assign y y>>1 %endrep %endmacro %assign x 0 %rep 256 ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7 %assign x x+1 %endrep SECTION .text cextern pb_0 cextern pb_1 cextern pw_1 cextern pw_8 cextern pw_16 cextern pw_32 cextern pw_00ff cextern pw_ppppmmmm cextern pw_ppmmppmm cextern pw_pmpmpmpm cextern pw_pmmpzzzz cextern pd_1 cextern hsub_mul cextern popcnt_table ;============================================================================= ; SSD ;============================================================================= %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD_ONE 2 cglobal pixel_ssd_%1x%2, 4,7,6 FIX_STRIDES r1, r3 %if mmsize == %1*2 %define offset0_1 r1 %define offset0_2 r1*2 %define offset0_3 r5 %define offset1_1 r3 %define offset1_2 r3*2 %define offset1_3 r6 lea r5, [3*r1] lea r6, [3*r3] %elif mmsize == %1 %define offset0_1 mmsize %define offset0_2 r1 %define offset0_3 r1+mmsize %define offset1_1 mmsize %define offset1_2 r3 %define offset1_3 r3+mmsize %elif mmsize == %1/2 %define offset0_1 mmsize %define offset0_2 mmsize*2 %define offset0_3 mmsize*3 %define offset1_1 mmsize %define offset1_2 mmsize*2 %define offset1_3 mmsize*3 %endif %assign %%n %2/(2*mmsize/%1) %if %%n > 1 mov r4d, %%n %endif pxor m0, m0 .loop: mova m1, [r0] mova m2, [r0+offset0_1] mova m3, [r0+offset0_2] mova m4, [r0+offset0_3] psubw m1, [r2] psubw m2, [r2+offset1_1] psubw m3, [r2+offset1_2] psubw m4, [r2+offset1_3] %if %%n > 1 lea r0, [r0+r1*(%2/%%n)] lea r2, [r2+r3*(%2/%%n)] %endif pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 %if %%n > 1 
dec r4d jg .loop %endif HADDD m0, m5 movd eax, xm0 RET %endmacro INIT_MMX mmx2 SSD_ONE 4, 4 SSD_ONE 4, 8 SSD_ONE 4, 16 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 INIT_XMM sse2 SSD_ONE 8, 4 SSD_ONE 8, 8 SSD_ONE 8, 16 SSD_ONE 16, 8 SSD_ONE 16, 16 INIT_YMM avx2 SSD_ONE 16, 8 SSD_ONE 16, 16 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 %macro SSD_LOAD_FULL 5 mova m1, [t0+%1] mova m2, [t2+%2] mova m3, [t0+%3] mova m4, [t2+%4] %if %5==1 add t0, t1 add t2, t3 %elif %5==2 lea t0, [t0+2*t1] lea t2, [t2+2*t3] %endif %endmacro %macro LOAD 5 movh m%1, %3 movh m%2, %4 %if %5 lea t0, [t0+2*t1] %endif %endmacro %macro JOIN 7 movh m%3, %5 movh m%4, %6 %if %7 lea t2, [t2+2*t3] %endif punpcklbw m%1, m7 punpcklbw m%3, m7 psubw m%1, m%3 punpcklbw m%2, m7 punpcklbw m%4, m7 psubw m%2, m%4 %endmacro %macro JOIN_SSE2 7 movh m%3, %5 movh m%4, %6 %if %7 lea t2, [t2+2*t3] %endif punpcklqdq m%1, m%2 punpcklqdq m%3, m%4 DEINTB %2, %1, %4, %3, 7 psubw m%2, m%4 psubw m%1, m%3 %endmacro %macro JOIN_SSSE3 7 movh m%3, %5 movh m%4, %6 %if %7 lea t2, [t2+2*t3] %endif punpcklbw m%1, m%3 punpcklbw m%2, m%4 %endmacro %macro LOAD_AVX2 5 mova xm%1, %3 vinserti128 m%1, m%1, %4, 1 %if %5 lea t0, [t0+2*t1] %endif %endmacro %macro JOIN_AVX2 7 mova xm%2, %5 vinserti128 m%2, m%2, %6, 1 %if %7 lea t2, [t2+2*t3] %endif SBUTTERFLY bw, %1, %2, %3 %endmacro %macro SSD_LOAD_HALF 5 LOAD 1, 2, [t0+%1], [t0+%3], 1 JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 LOAD 3, 4, [t0+%1], [t0+%3], %5 JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5 %endmacro %macro SSD_CORE 7-8 %ifidn %8, FULL mova m%6, m%2 mova m%7, m%4 psubusb m%2, m%1 psubusb m%4, m%3 psubusb m%1, m%6 psubusb m%3, m%7 por m%1, m%2 por m%3, m%4 punpcklbw m%2, m%1, m%5 punpckhbw m%1, m%5 punpcklbw m%4, m%3, m%5 punpckhbw m%3, m%5 %endif pmaddwd m%1, m%1 pmaddwd m%2, m%2 pmaddwd m%3, m%3 pmaddwd m%4, m%4 %endmacro %macro SSD_CORE_SSE2 7-8 %ifidn %8, FULL DEINTB %6, %1, %7, %2, %5 psubw m%6, m%7 psubw m%1, m%2 SWAP %6, %2, %1 DEINTB %6, %3, %7, %4, %5 psubw m%6, m%7 psubw m%3, m%4 SWAP %6, %4, %3 %endif pmaddwd m%1, m%1 pmaddwd m%2, m%2 pmaddwd m%3, m%3 pmaddwd m%4, m%4 %endmacro %macro SSD_CORE_SSSE3 7-8 %ifidn %8, FULL punpckhbw m%6, m%1, m%2 punpckhbw m%7, m%3, m%4 punpcklbw m%1, m%2 punpcklbw m%3, m%4 SWAP %6, %2, %3 SWAP %7, %4 %endif pmaddubsw m%1, m%5 pmaddubsw m%2, m%5 pmaddubsw m%3, m%5 pmaddubsw m%4, m%5 pmaddwd m%1, m%1 pmaddwd m%2, m%2 pmaddwd m%3, m%3 pmaddwd m%4, m%4 %endmacro %macro SSD_ITER 6 SSD_LOAD_%1 %2,%3,%4,%5,%6 SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 %endmacro ;----------------------------------------------------------------------------- ; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SSD 2 %if %1 != %2 %assign function_align 8 %else %assign function_align 16 %endif cglobal pixel_ssd_%1x%2, 0,0,0 mov al, %1*%2/mmsize/2 %if %1 != %2 jmp mangle(private_prefix %+ _pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop) %else .startloop: %if ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3 PROLOGUE 0,0,8 %else PROLOGUE 0,5 DECLARE_REG_TMP 1,2,3,4 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m %endif %if cpuflag(ssse3) mova m7, [hsub_mul] %elifidn cpuname, sse2 mova m7, [pw_00ff] %elif %1 >= mmsize pxor m7, m7 %endif pxor m0, m0 ALIGN 16 .loop: %if %1 > mmsize SSD_ITER FULL, 0, 0, mmsize, mmsize, 1 %elif %1 == mmsize SSD_ITER FULL, 0, 0, t1, t3, 2 %else SSD_ITER HALF, 0, 0, t1, t3, 2 %endif dec al jg .loop %if mmsize==32 vextracti128 xm1, m0, 
1 paddd xm0, xm1 HADDD xm0, xm1 movd eax, xm0 %else HADDD m0, m1 movd eax, m0 %endif RET %endif %endmacro INIT_MMX mmx SSD 16, 16 SSD 16, 8 SSD 8, 8 SSD 8, 16 SSD 4, 4 SSD 8, 4 SSD 4, 8 SSD 4, 16 INIT_XMM sse2slow SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 INIT_XMM sse2 %define SSD_CORE SSD_CORE_SSE2 %define JOIN JOIN_SSE2 SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 INIT_XMM ssse3 %define SSD_CORE SSD_CORE_SSSE3 %define JOIN JOIN_SSSE3 SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 INIT_XMM avx SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 INIT_MMX ssse3 SSD 4, 4 SSD 4, 8 SSD 4, 16 INIT_XMM xop SSD 16, 16 SSD 8, 8 SSD 16, 8 SSD 8, 16 SSD 8, 4 %define LOAD LOAD_AVX2 %define JOIN JOIN_AVX2 INIT_YMM avx2 SSD 16, 16 SSD 16, 8 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2, ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) ; ; The maximum width this function can handle without risk of overflow is given ; in the following equation: (mmsize in bits) ; ; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2 ; ; For 10-bit XMM this means width >= 32832. At sane distortion levels ; it will take much more than that though. ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7,7 shl r4d, 2 FIX_STRIDES r1, r3 add r0, r4 add r2, r4 neg r4 pxor m4, m4 pxor m5, m5 %if mmsize == 32 vbroadcasti128 m6, [ssd_nv12_shuf] %endif .loopy: mov r6, r4 pxor m2, m2 pxor m3, m3 .loopx: mova m0, [r0+r6] mova m1, [r0+r6+mmsize] psubw m0, [r2+r6] psubw m1, [r2+r6+mmsize] %if mmsize == 32 pshufb m0, m6 pshufb m1, m6 %else SBUTTERFLY wd, 0, 1, 6 %endif %if cpuflag(xop) pmadcswd m2, m0, m0, m2 pmadcswd m3, m1, m1, m3 %else pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 %endif add r6, 2*mmsize jl .loopx %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled jz .no_overread psubd m3, m1 .no_overread: %endif punpckhdq m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the punpckhdq m1, m3, m5 ; equation above, putting the width limit at 8208 punpckldq m2, m5 punpckldq m3, m5 paddq m0, m1 paddq m2, m3 paddq m4, m0 paddq m4, m2 add r0, r1 add r2, r3 dec r5d jg .loopy mov r0, r6m mov r1, r7m %if mmsize == 32 vextracti128 xm0, m4, 1 paddq xm4, xm0 %endif movq [r0], xm4 movhps [r1], xm4 RET %endmacro ; SSD_NV12 %else ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2, ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ) ; ; This implementation can potentially overflow on image widths >= 11008 (or ; 6604 if interlaced), since it is called on blocks of height up to 12 (resp ; 20). At sane distortion levels it will take much more than that though. 
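;
; For reference, a minimal C sketch of what ssd_nv12_core computes (purely
; illustrative, not the code x264 builds; the helper and variable names below
; are assumptions, types from <stdint.h>): it walks a pair of interleaved U/V
; (NV12 chroma) buffers and accumulates the squared differences of the U and
; V samples into two separate 64-bit sums.
;
;   static void ssd_nv12_ref( const uint8_t *p1, intptr_t s1,
;                             const uint8_t *p2, intptr_t s2,
;                             int width, int height,
;                             uint64_t *ssd_u, uint64_t *ssd_v )
;   {
;       uint64_t su = 0, sv = 0;
;       for( int y = 0; y < height; y++ )
;           for( int x = 0; x < width; x++ )
;           {
;               int du = p1[y*s1+2*x]   - p2[y*s2+2*x];   /* U sample */
;               int dv = p1[y*s1+2*x+1] - p2[y*s2+2*x+1]; /* V sample */
;               su += du*du;
;               sv += dv*dv;
;           }
;       *ssd_u = su;
;       *ssd_v = sv;
;   }
;
; The SIMD code below keeps packed 32-bit accumulators across the whole block,
; which is where the width limit quoted above comes from; the sketch sidesteps
; that by accumulating directly in 64 bits.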
;----------------------------------------------------------------------------- %macro SSD_NV12 0 cglobal pixel_ssd_nv12_core, 6,7 add r4d, r4d add r0, r4 add r2, r4 neg r4 pxor m3, m3 pxor m4, m4 mova m5, [pw_00ff] .loopy: mov r6, r4 .loopx: %if mmsize == 32 ; only 16-byte alignment is guaranteed movu m2, [r0+r6] movu m1, [r2+r6] %else mova m2, [r0+r6] mova m1, [r2+r6] %endif psubusb m0, m2, m1 psubusb m1, m2 por m0, m1 psrlw m2, m0, 8 pand m0, m5 %if cpuflag(xop) pmadcswd m4, m2, m2, m4 pmadcswd m3, m0, m0, m3 %else pmaddwd m2, m2 pmaddwd m0, m0 paddd m4, m2 paddd m3, m0 %endif add r6, mmsize jl .loopx %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled jz .no_overread pcmpeqb xm1, xm1 pandn m0, m1, m0 ; zero the lower half pandn m2, m1, m2 psubd m3, m0 psubd m4, m2 .no_overread: %endif add r0, r1 add r2, r3 dec r5d jg .loopy mov r0, r6m mov r1, r7m %if cpuflag(ssse3) phaddd m3, m4 %else SBUTTERFLY qdq, 3, 4, 0 paddd m3, m4 %endif %if mmsize == 32 vextracti128 xm4, m3, 1 paddd xm3, xm4 %endif psllq xm4, xm3, 32 paddd xm3, xm4 psrlq xm3, 32 movq [r0], xm3 movhps [r1], xm3 RET %endmacro ; SSD_NV12 %endif ; !HIGH_BIT_DEPTH INIT_XMM sse2 SSD_NV12 INIT_XMM avx SSD_NV12 INIT_XMM xop SSD_NV12 INIT_YMM avx2 SSD_NV12 ;============================================================================= ; variance ;============================================================================= %macro VAR_START 1 pxor m5, m5 ; sum pxor m6, m6 ; sum squared %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] %elif mmsize == 16 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH %endmacro %macro VAR_END 0 pmaddwd m5, [pw_1] SBUTTERFLY dq, 5, 6, 0 paddd m5, m6 %if mmsize == 32 vextracti128 xm6, m5, 1 paddd xm5, xm6 %endif MOVHL xm6, xm5 paddd xm5, xm6 %if ARCH_X86_64 movq rax, xm5 %else movd eax, xm5 %if cpuflag(avx) pextrd edx, xm5, 1 %else pshuflw xm5, xm5, q1032 movd edx, xm5 %endif %endif RET %endmacro %macro VAR_CORE 0 paddw m5, m0 paddw m5, m3 paddw m5, m1 paddw m5, m4 pmaddwd m0, m0 pmaddwd m3, m3 pmaddwd m1, m1 pmaddwd m4, m4 paddd m6, m0 paddd m6, m3 paddd m6, m1 paddd m6, m4 %endmacro ;----------------------------------------------------------------------------- ; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 VAR_START 0 mov r2d, 8 .loop: mova m0, [r0] mova m1, [r0+mmsize] mova m3, [r0+r1] mova m4, [r0+r1+mmsize] lea r0, [r0+r1*2] VAR_CORE dec r2d jg .loop VAR_END cglobal pixel_var_8x8, 2,3,8 lea r2, [r1*3] VAR_START 0 mova m0, [r0] mova m1, [r0+r1*2] mova m3, [r0+r1*4] mova m4, [r0+r2*2] lea r0, [r0+r1*8] VAR_CORE mova m0, [r0] mova m1, [r0+r1*2] mova m3, [r0+r1*4] mova m4, [r0+r2*2] VAR_CORE VAR_END %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR %else ; HIGH_BIT_DEPTH == 0 %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 VAR_START 1 mov r2d, 8 .loop: mova m0, [r0] mova m3, [r0+r1] DEINTB 1, 0, 4, 3, 7 lea r0, [r0+r1*2] VAR_CORE dec r2d jg .loop VAR_END cglobal pixel_var_8x8, 2,4,8 VAR_START 1 mov r2d, 2 lea r3, [r1*3] .loop: movh m0, [r0] movh m3, [r0+r1] movhps m0, [r0+r1*2] movhps m3, [r0+r3] DEINTB 1, 0, 4, 3, 7 lea r0, [r0+r1*4] VAR_CORE dec r2d jg .loop VAR_END cglobal pixel_var_8x16, 2,4,8 VAR_START 1 mov r2d, 4 lea r3, [r1*3] .loop: movh m0, [r0] movh m3, [r0+r1] movhps m0, [r0+r1*2] movhps m3, [r0+r3] DEINTB 1, 0, 4, 3, 7 lea r0, [r0+r1*4] VAR_CORE dec r2d jg .loop VAR_END %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR 
%endif ; !HIGH_BIT_DEPTH INIT_YMM avx2 cglobal pixel_var_16x16, 2,4,7 FIX_STRIDES r1 VAR_START 0 mov r2d, 4 lea r3, [r1*3] .loop: %if HIGH_BIT_DEPTH mova m0, [r0] mova m3, [r0+r1] mova m1, [r0+r1*2] mova m4, [r0+r3] %else pmovzxbw m0, [r0] pmovzxbw m3, [r0+r1] pmovzxbw m1, [r0+r1*2] pmovzxbw m4, [r0+r3] %endif lea r0, [r0+r1*4] VAR_CORE dec r2d jg .loop VAR_END %macro VAR_AVX512_CORE 1 ; accum %if %1 paddw m0, m2 pmaddwd m2, m2 paddw m0, m3 pmaddwd m3, m3 paddd m1, m2 paddd m1, m3 %else paddw m0, m2, m3 pmaddwd m2, m2 pmaddwd m3, m3 paddd m1, m2, m3 %endif %endmacro %macro VAR_AVX512_CORE_16x16 1 ; accum %if HIGH_BIT_DEPTH mova ym2, [r0] vinserti64x4 m2, [r0+r1], 1 mova ym3, [r0+2*r1] vinserti64x4 m3, [r0+r3], 1 %else vbroadcasti64x2 ym2, [r0] vbroadcasti64x2 m2 {k1}, [r0+r1] vbroadcasti64x2 ym3, [r0+2*r1] vbroadcasti64x2 m3 {k1}, [r0+r3] pshufb m2, m4 pshufb m3, m4 %endif VAR_AVX512_CORE %1 %endmacro %macro VAR_AVX512_CORE_8x8 1 ; accum %if HIGH_BIT_DEPTH mova xm2, [r0] mova xm3, [r0+r1] %else movq xm2, [r0] movq xm3, [r0+r1] %endif vinserti128 ym2, [r0+2*r1], 1 vinserti128 ym3, [r0+r2], 1 lea r0, [r0+4*r1] vinserti32x4 m2, [r0], 2 vinserti32x4 m3, [r0+r1], 2 vinserti32x4 m2, [r0+2*r1], 3 vinserti32x4 m3, [r0+r2], 3 %if HIGH_BIT_DEPTH == 0 punpcklbw m2, m4 punpcklbw m3, m4 %endif VAR_AVX512_CORE %1 %endmacro INIT_ZMM avx512 cglobal pixel_var_16x16, 2,4 FIX_STRIDES r1 mov r2d, 0xf0 lea r3, [3*r1] %if HIGH_BIT_DEPTH == 0 vbroadcasti64x4 m4, [var_shuf_avx512] kmovb k1, r2d %endif VAR_AVX512_CORE_16x16 0 .loop: lea r0, [r0+4*r1] VAR_AVX512_CORE_16x16 1 sub r2d, 0x50 jg .loop %if ARCH_X86_64 == 0 pop r3d %assign regs_used 3 %endif var_avx512_end: vbroadcasti32x4 m2, [pw_1] pmaddwd m0, m2 SBUTTERFLY dq, 0, 1, 2 paddd m0, m1 vextracti32x8 ym1, m0, 1 paddd ym0, ym1 vextracti128 xm1, ym0, 1 paddd xmm0, xm0, xm1 punpckhqdq xmm1, xmm0, xmm0 paddd xmm0, xmm1 %if ARCH_X86_64 movq rax, xmm0 %else movd eax, xmm0 pextrd edx, xmm0, 1 %endif RET %if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth cglobal pixel_var_8x8, 2,3 lea r2, [3*r1] pxor xm4, xm4 VAR_AVX512_CORE_8x8 0 jmp var_avx512_end %endif cglobal pixel_var_8x16, 2,3 FIX_STRIDES r1 lea r2, [3*r1] %if HIGH_BIT_DEPTH == 0 pxor xm4, xm4 %endif VAR_AVX512_CORE_8x8 0 lea r0, [r0+4*r1] VAR_AVX512_CORE_8x8 1 jmp var_avx512_end ;----------------------------------------------------------------------------- ; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 2 %endif %macro VAR2_END 3 ; src, tmp, shift movifnidn r2, r2mp pshufd %2, %1, q3331 pmuludq %1, %1 movq [r2], %2 ; sqr_u sqr_v psrld %1, %3 psubd %2, %1 ; sqr - (sum * sum >> shift) MOVHL %1, %2 paddd %1, %2 movd eax, %1 RET %endmacro %macro VAR2_8x8_SSE2 2 %if HIGH_BIT_DEPTH cglobal pixel_var2_8x%1, 2,3,6 pxor m4, m4 pxor m5, m5 %define %%sum2 m4 %define %%sqr2 m5 %else cglobal pixel_var2_8x%1, 2,3,7 mova m6, [pw_00ff] %define %%sum2 m0 %define %%sqr2 m1 %endif pxor m0, m0 ; sum pxor m1, m1 ; sqr mov t0d, (%1-1)*FENC_STRIDEB .loop: %if HIGH_BIT_DEPTH mova m2, [r0+1*t0] psubw m2, [r1+2*t0] mova m3, [r0+1*t0+16] psubw m3, [r1+2*t0+32] %else mova m3, [r0+1*t0] movq m5, [r1+2*t0] punpcklqdq m5, [r1+2*t0+16] DEINTB 2, 3, 4, 5, 6 psubw m2, m4 psubw m3, m5 %endif paddw m0, m2 pmaddwd m2, m2 paddw %%sum2, m3 pmaddwd m3, m3 paddd m1, m2 paddd %%sqr2, m3 sub t0d, FENC_STRIDEB jge .loop %if HIGH_BIT_DEPTH SBUTTERFLY dq, 0, 4, 2 paddw m0, m4 ; 
sum_u sum_v pmaddwd m0, [pw_1] SBUTTERFLY dq, 1, 5, 2 paddd m1, m5 ; sqr_u sqr_v SBUTTERFLY dq, 0, 1, 2 paddd m0, m1 %else pmaddwd m0, [pw_1] shufps m2, m0, m1, q2020 shufps m0, m1, q3131 paddd m0, m2 pshufd m0, m0, q3120 ; sum_u sqr_u sum_v sqr_v %endif VAR2_END m0, m1, %2 %endmacro INIT_XMM sse2 VAR2_8x8_SSE2 8, 6 VAR2_8x8_SSE2 16, 7 %macro VAR2_CORE 3 ; src1, src2, accum %if %3 paddw m0, %1 pmaddwd %1, %1 paddw m0, %2 pmaddwd %2, %2 paddd m1, %1 paddd m1, %2 %else paddw m0, %1, %2 pmaddwd %1, %1 pmaddwd %2, %2 paddd m1, %1, %2 %endif %endmacro %if HIGH_BIT_DEPTH == 0 INIT_XMM ssse3 cglobal pixel_var2_internal pxor m0, m0 ; sum pxor m1, m1 ; sqr .loop: movq m2, [r0+1*t0] punpcklbw m2, [r1+2*t0] movq m3, [r0+1*t0-1*FENC_STRIDE] punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE] movq m4, [r0+1*t0-2*FENC_STRIDE] punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE] movq m5, [r0+1*t0-3*FENC_STRIDE] punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE] pmaddubsw m2, m7 pmaddubsw m3, m7 pmaddubsw m4, m7 pmaddubsw m5, m7 VAR2_CORE m2, m3, 1 VAR2_CORE m4, m5, 1 sub t0d, 4*FENC_STRIDE jg .loop pmaddwd m0, [pw_1] ret %macro VAR2_8x8_SSSE3 2 cglobal pixel_var2_8x%1, 2,3,8 mova m7, [hsub_mul] mov t0d, (%1-1)*FENC_STRIDE call pixel_var2_internal_ssse3 ; u add r0, 8 add r1, 16 SBUTTERFLY qdq, 0, 1, 6 paddd m1, m0 mov t0d, (%1-1)*FENC_STRIDE call pixel_var2_internal_ssse3 ; v SBUTTERFLY qdq, 0, 6, 2 paddd m0, m6 phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v VAR2_END m1, m0, %2 %endmacro VAR2_8x8_SSSE3 8, 6 VAR2_8x8_SSSE3 16, 7 %endif ; !HIGH_BIT_DEPTH %macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset %if HIGH_BIT_DEPTH %if mmsize == 64 mova m2, [r1+2*%1+%2*FDEC_STRIDEB] vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020 mova m3, [r1+2*%1+%3*FDEC_STRIDEB] vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020 %else mova xm2, [r1+2*%1+%2*FDEC_STRIDEB] vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1 mova xm3, [r1+2*%1+%3*FDEC_STRIDEB] vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1 %endif psubw m2, [r0+1*%1+%2*FENC_STRIDEB] psubw m3, [r0+1*%1+%3*FENC_STRIDEB] %else pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE] mova m4, [r1+2*%1+%2*FDEC_STRIDE] pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE] mova m5, [r1+2*%1+%3*FDEC_STRIDE] punpcklbw m4, m6 punpcklbw m5, m6 psubw m2, m4 psubw m3, m5 %endif %endmacro %macro VAR2_8x8_AVX2 2 %if HIGH_BIT_DEPTH cglobal pixel_var2_8x%1, 2,3,4 %else cglobal pixel_var2_8x%1, 2,3,7 pxor m6, m6 %endif mov t0d, (%1-3)*FENC_STRIDEB VAR2_AVX2_LOAD t0, 2, 1 VAR2_CORE m2, m3, 0 .loop: VAR2_AVX2_LOAD t0, 0, -1 VAR2_CORE m2, m3, 1 sub t0d, 2*FENC_STRIDEB jg .loop pmaddwd m0, [pw_1] SBUTTERFLY qdq, 0, 1, 2 paddd m0, m1 vextracti128 xm1, m0, 1 phaddd xm0, xm1 VAR2_END xm0, xm1, %2 %endmacro INIT_YMM avx2 VAR2_8x8_AVX2 8, 6 VAR2_8x8_AVX2 16, 7 %macro VAR2_AVX512_END 1 ; shift vbroadcasti32x4 m2, [pw_1] pmaddwd m0, m2 SBUTTERFLY qdq, 0, 1, 2 paddd m0, m1 vextracti32x8 ym1, m0, 1 paddd ym0, ym1 psrlq ym1, ym0, 32 paddd ym0, ym1 vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v VAR2_END xmm0, xmm1, %1 %endmacro INIT_ZMM avx512 cglobal pixel_var2_8x8, 2,3 %if HIGH_BIT_DEPTH == 0 pxor xm6, xm6 %endif VAR2_AVX2_LOAD 0, 0, 2 VAR2_CORE m2, m3, 0 VAR2_AVX2_LOAD 0, 4, 6 VAR2_CORE m2, m3, 1 VAR2_AVX512_END 6 cglobal pixel_var2_8x16, 2,3 %if HIGH_BIT_DEPTH == 0 pxor xm6, xm6 %endif mov t0d, 10*FENC_STRIDEB VAR2_AVX2_LOAD 0, 14, 12 VAR2_CORE m2, m3, 0 .loop: VAR2_AVX2_LOAD t0, 0, -2 VAR2_CORE m2, m3, 1 sub t0d, 4*FENC_STRIDEB jg .loop VAR2_AVX512_END 7 ;============================================================================= ; SATD 
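;
; SATD here is the sum of absolute values of the (unnormalized) 4x4 Hadamard
; transform of the residual, halved. A minimal C sketch of the 4x4 case, for
; orientation only (function and variable names are assumptions; abs() from
; <stdlib.h>, types from <stdint.h>); larger block sizes are accumulated from
; 4x4 (or 8x4) subblocks:
;
;   static int satd_4x4_ref( const uint8_t *p1, intptr_t s1,
;                            const uint8_t *p2, intptr_t s2 )
;   {
;       int d[4][4], t[4][4], sum = 0;
;       for( int i = 0; i < 4; i++ )            /* residual */
;           for( int j = 0; j < 4; j++ )
;               d[i][j] = p1[i*s1+j] - p2[i*s2+j];
;       for( int i = 0; i < 4; i++ )            /* horizontal 1-D transform */
;       {
;           int a0 = d[i][0] + d[i][1], a1 = d[i][0] - d[i][1];
;           int a2 = d[i][2] + d[i][3], a3 = d[i][2] - d[i][3];
;           t[i][0] = a0 + a2; t[i][2] = a0 - a2;
;           t[i][1] = a1 + a3; t[i][3] = a1 - a3;
;       }
;       for( int j = 0; j < 4; j++ )            /* vertical 1-D + abs-sum */
;       {
;           int a0 = t[0][j] + t[1][j], a1 = t[0][j] - t[1][j];
;           int a2 = t[2][j] + t[3][j], a3 = t[2][j] - t[3][j];
;           sum += abs(a0+a2) + abs(a0-a2) + abs(a1+a3) + abs(a1-a3);
;       }
;       return sum >> 1;
;   }
;
; (x264 defines SATD as half the raw transform abs-sum; the asm variants below
; produce the same halved value without an explicit final shift.)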
;============================================================================= %macro JDUP 2 %if cpuflag(sse4) ; just use shufps on anything post conroe shufps %1, %2, 0 %elif cpuflag(ssse3) && notcpuflag(atom) ; join 2x 32 bit and duplicate them ; emulating shufps is faster on conroe punpcklqdq %1, %2 movsldup %1, %1 %else ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d punpckldq %1, %2 %endif %endmacro %macro HSUMSUB 5 pmaddubsw m%2, m%5 pmaddubsw m%1, m%5 pmaddubsw m%4, m%5 pmaddubsw m%3, m%5 %endmacro %macro DIFF_UNPACK_SSE2 5 punpcklbw m%1, m%5 punpcklbw m%2, m%5 punpcklbw m%3, m%5 punpcklbw m%4, m%5 psubw m%1, m%2 psubw m%3, m%4 %endmacro %macro DIFF_SUMSUB_SSSE3 5 HSUMSUB %1, %2, %3, %4, %5 psubw m%1, m%2 psubw m%3, m%4 %endmacro %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer movd %1, %3 movd %2, %4 JDUP %1, %2 %endmacro %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer movddup m%3, %6 movddup m%4, %8 movddup m%1, %5 movddup m%2, %7 %endmacro %macro LOAD_DUP_4x8P_PENRYN 8 ; penryn and nehalem run punpcklqdq and movddup in different units movh m%3, %6 movh m%4, %8 punpcklqdq m%3, m%3 movddup m%1, %5 punpcklqdq m%4, m%4 movddup m%2, %7 %endmacro %macro LOAD_SUMSUB_8x2P 9 LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr movddup m%1, [%7] movddup m%2, [%7+8] mova m%4, [%6] movddup m%3, m%4 punpckhqdq m%4, m%4 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr movu m%4, [%7] mova m%2, [%6] DEINTB %1, %2, %3, %4, %5 psubw m%1, m%3 psubw m%2, m%4 SUMSUB_BA w, %1, %2, %3 %endmacro %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp] LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12 LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3 LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3 LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5 %endmacro %macro LOAD_SUMSUB_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr vbroadcasti128 m%1, [%6] vbroadcasti128 m%3, [%7] vbroadcasti128 m%2, [%8] vbroadcasti128 m%4, [%9] DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3 LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer mova xm%3, %6 mova xm%4, %8 mova xm%1, %5 mova xm%2, %7 vpermq m%3, m%3, q0011 vpermq m%4, m%4, q0011 vpermq m%1, m%1, q0011 vpermq m%2, m%2, q0011 %endmacro %macro LOAD_SUMSUB8_16x2P_AVX2 9 ; 2*dst, 2*tmp, mul, 4*ptr LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5 %endmacro %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?] 
LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3] LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5] %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro ; in: r4=3*stride1, r5=3*stride2 ; in: %2 = horizontal offset ; in: %3 = whether we need to increment pix1 and pix2 ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 %xdefine %%n nn%1 %assign offset %2*SIZEOF_PIXEL LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset] LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset] %if %3 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif HADAMARD4_2D 4, 5, 6, 7, 3, %%n paddw m4, m6 SWAP %%n, 4 %endmacro ; in: %1 = horizontal if 0, vertical if 1 %macro SATD_8x4_SSE 8-9 %if %1 HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax %else HADAMARD4_V %2, %3, %4, %5, %6 ; doing the abs first is a slight advantage ABSW2 m%2, m%4, m%2, m%4, m%6, m%7 ABSW2 m%3, m%5, m%3, m%5, m%6, m%7 HADAMARD 1, max, %2, %4, %6, %7 %endif %ifnidn %9, swap paddw m%8, m%2 %else SWAP %8, %2 %endif %if %1 paddw m%8, m%4 %else HADAMARD 1, max, %3, %5, %6, %7 paddw m%8, m%3 %endif %endmacro %macro SATD_START_MMX 0 FIX_STRIDES r1, r3 lea r4, [3*r1] ; 3*stride1 lea r5, [3*r3] ; 3*stride2 %endmacro %macro SATD_END_MMX 0 %if HIGH_BIT_DEPTH HADDUW m0, m1 movd eax, m0 %else ; !HIGH_BIT_DEPTH pshufw m1, m0, q1032 paddw m0, m1 pshufw m1, m0, q2301 paddw m0, m1 movd eax, m0 and eax, 0xffff %endif ; HIGH_BIT_DEPTH RET %endmacro ; FIXME avoid the spilling of regs to hold 3*stride. ; for small blocks on x86_32, modify pixel pointer instead. ;----------------------------------------------------------------------------- ; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal pixel_satd_16x4_internal SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 SATD_4x4_MMX m2, 8, 0 paddw m0, m1 SATD_4x4_MMX m1, 12, 0 paddw m0, m2 paddw m0, m1 ret cglobal pixel_satd_8x8_internal SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 1 paddw m0, m2 paddw m0, m1 pixel_satd_8x4_internal_mmx2: SATD_4x4_MMX m2, 0, 0 SATD_4x4_MMX m1, 4, 0 paddw m0, m2 paddw m0, m1 ret %if HIGH_BIT_DEPTH %macro SATD_MxN_MMX 3 cglobal pixel_satd_%1x%2, 4,7 SATD_START_MMX pxor m0, m0 call pixel_satd_%1x%3_internal_mmx2 HADDUW m0, m1 movd r6d, m0 %rep %2/%3-1 pxor m0, m0 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_%1x%3_internal_mmx2 movd m2, r4 HADDUW m0, m1 movd r4, m0 add r6, r4 movd r4, m2 %endrep movifnidn eax, r6d RET %endmacro SATD_MxN_MMX 16, 16, 4 SATD_MxN_MMX 16, 8, 4 SATD_MxN_MMX 8, 16, 8 %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 cglobal pixel_satd_16x16, 4,6 SATD_START_MMX pxor m0, m0 %rep 3 call pixel_satd_16x4_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endrep call pixel_satd_16x4_internal_mmx2 HADDUW m0, m1 movd eax, m0 RET cglobal pixel_satd_16x8, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_16x4_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_16x4_internal_mmx2 SATD_END_MMX cglobal pixel_satd_8x16, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_8x8_internal_mmx2 lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_8x8_internal_mmx2 SATD_END_MMX %endif ; !HIGH_BIT_DEPTH cglobal pixel_satd_8x8, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_8x8_internal_mmx2 SATD_END_MMX cglobal pixel_satd_8x4, 4,6 SATD_START_MMX pxor m0, m0 call pixel_satd_8x4_internal_mmx2 
SATD_END_MMX cglobal pixel_satd_4x16, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 SATD_4x4_MMX m1, 0, 1 paddw m0, m1 SATD_4x4_MMX m1, 0, 1 paddw m0, m1 SATD_4x4_MMX m1, 0, 0 paddw m0, m1 SATD_END_MMX cglobal pixel_satd_4x8, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 1 SATD_4x4_MMX m1, 0, 0 paddw m0, m1 SATD_END_MMX cglobal pixel_satd_4x4, 4,6 SATD_START_MMX SATD_4x4_MMX m0, 0, 0 SATD_END_MMX %macro SATD_START_SSE2 2-3 0 FIX_STRIDES r1, r3 %if HIGH_BIT_DEPTH && %3 pxor %2, %2 %elif cpuflag(ssse3) && notcpuflag(atom) %if mmsize==32 mova %2, [hmul_16p] %else mova %2, [hmul_8p] %endif %endif lea r4, [3*r1] lea r5, [3*r3] pxor %1, %1 %endmacro %macro SATD_END_SSE2 1-2 %if HIGH_BIT_DEPTH HADDUW %1, xm0 %if %0 == 2 paddd %1, %2 %endif %else HADDW %1, xm7 %endif movd eax, %1 RET %endmacro %macro SATD_ACCUM 3 %if HIGH_BIT_DEPTH HADDUW %1, %2 paddd %3, %1 pxor %1, %1 %endif %endmacro %macro BACKUP_POINTERS 0 %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r6, r0 mov r7, r2 %endif %endmacro %macro RESTORE_AND_INC_POINTERS 0 %if ARCH_X86_64 lea r0, [r6+8*SIZEOF_PIXEL] lea r2, [r7+8*SIZEOF_PIXEL] %if WIN64 POP r7 %endif %else mov r0, r0mp mov r2, r2mp add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL %endif %endmacro %macro SATD_4x8_SSE 3 %if HIGH_BIT_DEPTH movh m0, [r0+0*r1] movh m4, [r2+0*r3] movh m1, [r0+1*r1] movh m5, [r2+1*r3] movhps m0, [r0+4*r1] movhps m4, [r2+4*r3] movh m2, [r0+2*r1] movh m6, [r2+2*r3] psubw m0, m4 movh m3, [r0+r4] movh m4, [r2+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] movhps m1, [r0+1*r1] movhps m5, [r2+1*r3] movhps m2, [r0+2*r1] movhps m6, [r2+2*r3] psubw m1, m5 movhps m3, [r0+r4] movhps m4, [r2+r5] psubw m2, m6 psubw m3, m4 %else ; !HIGH_BIT_DEPTH movd m4, [r2] movd m5, [r2+r3] movd m6, [r2+2*r3] add r2, r5 movd m0, [r0] movd m1, [r0+r1] movd m2, [r0+2*r1] add r0, r4 movd m3, [r2+r3] JDUP m4, m3 movd m3, [r0+r1] JDUP m0, m3 movd m3, [r2+2*r3] JDUP m5, m3 movd m3, [r0+2*r1] JDUP m1, m3 %if %1==0 && %2==1 mova m3, [hmul_4p] DIFFOP 0, 4, 1, 5, 3 %else DIFFOP 0, 4, 1, 5, 7 %endif movd m5, [r2] add r2, r5 movd m3, [r0] add r0, r4 movd m4, [r2] JDUP m6, m4 movd m4, [r0] JDUP m2, m4 movd m4, [r2+r3] JDUP m5, m4 movd m4, [r0+r1] JDUP m3, m4 %if %1==0 && %2==1 mova m4, [hmul_4p] DIFFOP 2, 6, 3, 5, 4 %else DIFFOP 2, 6, 3, 5, 7 %endif %endif ; HIGH_BIT_DEPTH SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3 %endmacro ;----------------------------------------------------------------------------- ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SATDS_SSE2 0 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH) cglobal pixel_satd_4x4, 4, 6, 6 SATD_START_MMX mova m4, [hmul_4p] LOAD_DUP_2x4P m2, m5, [r2], [r2+r3] LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5] LOAD_DUP_2x4P m0, m5, [r0], [r0+r1] LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4] DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 HADAMARD 0, sumsub, 0, 1, 2, 3 HADAMARD 4, sumsub, 0, 1, 2, 3 HADAMARD 1, amax, 0, 1, 2, 3 HADDW m0, m1 movd eax, m0 RET %endif cglobal pixel_satd_4x8, 4, 6, 8 SATD_START_MMX %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_4x16, 4, 6, 8 SATD_START_MMX %if vertical==0 mova m7, [hmul_4p] %endif SATD_4x8_SSE vertical, 0, swap lea r0, [r0+r1*2*SIZEOF_PIXEL] lea r2, [r2+r3*2*SIZEOF_PIXEL] SATD_4x8_SSE vertical, 1, add HADDW m7, m1 movd eax, m7 RET cglobal pixel_satd_8x8_internal LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, 
r0, r2, 1, 0 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 %%pixel_satd_8x4_internal: LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0 SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6 ret ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers) ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge) %if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx) cglobal pixel_satd_16x4_internal LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11 lea r2, [r2+4*r3] lea r0, [r0+4*r1] ; always use horizontal mode here SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10 SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10 ret cglobal pixel_satd_16x8, 4,6,12 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif jmp %%pixel_satd_16x8_internal cglobal pixel_satd_16x16, 4,6,12 SATD_START_SSE2 m10, m7 %if vertical mova m7, [pw_00ff] %endif call pixel_satd_16x4_internal call pixel_satd_16x4_internal %%pixel_satd_16x8_internal: call pixel_satd_16x4_internal call pixel_satd_16x4_internal SATD_END_SSE2 m10 %else cglobal pixel_satd_16x8, 4,6,8 SATD_START_SSE2 m6, m7 BACKUP_POINTERS call pixel_satd_8x8_internal RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal SATD_END_SSE2 m6 cglobal pixel_satd_16x16, 4,6,8 SATD_START_SSE2 m6, m7, 1 BACKUP_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal SATD_ACCUM m6, m0, m7 RESTORE_AND_INC_POINTERS call pixel_satd_8x8_internal call pixel_satd_8x8_internal SATD_END_SSE2 m6, m7 %endif cglobal pixel_satd_8x16, 4,6,8 SATD_START_SSE2 m6, m7 call pixel_satd_8x8_internal call pixel_satd_8x8_internal SATD_END_SSE2 m6 cglobal pixel_satd_8x8, 4,6,8 SATD_START_SSE2 m6, m7 call pixel_satd_8x8_internal SATD_END_SSE2 m6 cglobal pixel_satd_8x4, 4,6,8 SATD_START_SSE2 m6, m7 call %%pixel_satd_8x4_internal SATD_END_SSE2 m6 %endmacro ; SATDS_SSE2 %macro SA8D_INTER 0 %if ARCH_X86_64 %define lh m10 %define rh m0 %else %define lh m0 %define rh [esp+48] %endif %if HIGH_BIT_DEPTH HADDUW m0, m1 paddd lh, rh %else paddusw lh, rh %endif ; HIGH_BIT_DEPTH %endmacro %macro SA8D 0 ; sse2 doesn't seem to like the horizontal way of doing things %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- cglobal pixel_sa8d_8x8_internal lea r6, [r0+4*r1] lea r7, [r2+4*r3] LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2 LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7 %if vertical HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax %else ; non-sse2 HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11 %endif paddw m0, m1 paddw m0, m2 paddw m0, m8 SAVE_MM_PERMUTATION ret cglobal pixel_sa8d_8x8, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] %if vertical == 0 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 %endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 RET cglobal pixel_sa8d_16x16, 4,8,12 FIX_STRIDES r1, r3 lea r4, [3*r1] lea r5, [3*r3] %if vertical == 0 mova m7, [hmul_8p] %endif call pixel_sa8d_8x8_internal ; pix[0] add r2, 8*SIZEOF_PIXEL add r0, 8*SIZEOF_PIXEL %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova m10, m0 call pixel_sa8d_8x8_internal ; pix[8] lea r2, [r2+8*r3] lea r0, [r0+8*r1] SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride+8] sub r2, 8*SIZEOF_PIXEL sub r0, 8*SIZEOF_PIXEL SA8D_INTER call pixel_sa8d_8x8_internal ; pix[8*stride] 
SA8D_INTER SWAP 0, 10 %if HIGH_BIT_DEPTH == 0 HADDUW m0, m1 %endif movd eax, m0 add eax, 1 shr eax, 1 RET %else ; ARCH_X86_32 %if mmsize == 16 cglobal pixel_sa8d_8x8_internal %define spill0 [esp+4] %define spill1 [esp+20] %define spill2 [esp+36] %if vertical LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1 HADAMARD4_2D 0, 1, 2, 3, 4 movdqa spill0, m3 LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1 HADAMARD4_2D 4, 5, 6, 7, 3 HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax movdqa m3, spill0 paddw m0, m1 HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax %else ; mmsize == 8 mova m7, [hmul_8p] LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1 ; could do first HADAMARD4_V here to save spilling later ; surprisingly, not a win on conroe or even p4 mova spill0, m2 mova spill1, m3 mova spill2, m1 SWAP 1, 7 LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1 HADAMARD4_V 4, 5, 6, 7, 3 mova m1, spill2 mova m2, spill0 mova m3, spill1 mova spill0, m6 mova spill1, m7 HADAMARD4_V 0, 1, 2, 3, 7 SUMSUB_BADC w, 0, 4, 1, 5, 7 HADAMARD 2, sumsub, 0, 4, 7, 6 HADAMARD 2, sumsub, 1, 5, 7, 6 HADAMARD 1, amax, 0, 4, 7, 6 HADAMARD 1, amax, 1, 5, 7, 6 mova m6, spill0 mova m7, spill1 paddw m0, m1 SUMSUB_BADC w, 2, 6, 3, 7, 4 HADAMARD 2, sumsub, 2, 6, 4, 5 HADAMARD 2, sumsub, 3, 7, 4, 5 HADAMARD 1, amax, 2, 6, 4, 5 HADAMARD 1, amax, 3, 7, 4, 5 %endif ; sse2/non-sse2 paddw m0, m2 paddw m0, m3 SAVE_MM_PERMUTATION ret %endif ; ifndef mmx2 cglobal pixel_sa8d_8x8, 4,7 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 48 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH HADDUW m0, m1 %else HADDW m0, m1 %endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET cglobal pixel_sa8d_16x16, 4,7 FIX_STRIDES r1, r3 mov r6, esp and esp, ~15 sub esp, 64 lea r4, [3*r1] lea r5, [3*r3] call pixel_sa8d_8x8_internal %if mmsize == 8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif %if HIGH_BIT_DEPTH HADDUW m0, m1 %endif mova [esp+48], m0 call pixel_sa8d_8x8_internal mov r0, [r6+20] mov r2, [r6+28] add r0, 8*SIZEOF_PIXEL add r2, 8*SIZEOF_PIXEL SA8D_INTER mova [esp+48], m0 call pixel_sa8d_8x8_internal %if mmsize == 8 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %else SA8D_INTER %endif mova [esp+64-mmsize], m0 call pixel_sa8d_8x8_internal %if HIGH_BIT_DEPTH SA8D_INTER %else ; !HIGH_BIT_DEPTH paddusw m0, [esp+64-mmsize] %if mmsize == 16 HADDUW m0, m1 %else mova m2, [esp+48] pxor m7, m7 mova m1, m0 mova m3, m2 punpcklwd m0, m7 punpckhwd m1, m7 punpcklwd m2, m7 punpckhwd m3, m7 paddd m0, m1 paddd m2, m3 paddd m0, m2 HADDD m0, m1 %endif %endif ; HIGH_BIT_DEPTH movd eax, m0 add eax, 1 shr eax, 1 mov esp, r6 RET %endif ; !ARCH_X86_64 %endmacro ; SA8D ;============================================================================= ; SA8D_SATD ;============================================================================= ; %1: vertical/horizontal mode ; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9) ; m10: satd result ; m6, m11-15: tmp regs %macro SA8D_SATD_8x4 5 %if %1 LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 HADAMARD 0, sumsub, %2, %3, 6 HADAMARD 0, sumsub, %4, %5, 6 SBUTTERFLY wd, %2, %3, 6 SBUTTERFLY wd, %4, %5, 6 HADAMARD2_2D %2, %4, %3, %5, 6, dq mova m12, m%2 mova m13, m%3 mova m14, m%4 mova m15, m%5 HADAMARD 0, sumsub, %2, %3, 6 HADAMARD 0, sumsub, %4, %5, 6 SBUTTERFLY qdq, 12, 13, 6 HADAMARD 0, amax, 12, 13, 6 SBUTTERFLY qdq, 14, 15, 6 paddw m10, m12 HADAMARD 0, amax, 14, 15, 6 paddw m10, m14 %else LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1 HADAMARD4_V %2, %3, %4, %5, 6 pabsw m12, m%2 ; doing the abs first is a slight 
advantage pabsw m14, m%4 pabsw m13, m%3 pabsw m15, m%5 HADAMARD 1, max, 12, 14, 6, 11 paddw m10, m12 HADAMARD 1, max, 13, 15, 6, 11 paddw m10, m13 %endif %endmacro ; SA8D_SATD_8x4 ; %1: add spilled regs? ; %2: spill regs? %macro SA8D_SATD_ACCUM 2 %if HIGH_BIT_DEPTH pmaddwd m10, [pw_1] HADDUWD m0, m1 %if %1 paddd m10, temp1 paddd m0, temp0 %endif %if %2 mova temp1, m10 pxor m10, m10 %endif %elif %1 paddw m0, temp0 %endif %if %2 mova temp0, m0 %endif %endmacro %macro SA8D_SATD 0 %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH) cglobal pixel_sa8d_satd_8x8_internal SA8D_SATD_8x4 vertical, 0, 1, 2, 3 SA8D_SATD_8x4 vertical, 4, 5, 8, 9 %if vertical ; sse2-style HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax %else ; complete sa8d SUMSUB_BADC w, 0, 4, 1, 5, 12 HADAMARD 2, sumsub, 0, 4, 12, 11 HADAMARD 2, sumsub, 1, 5, 12, 11 SUMSUB_BADC w, 2, 8, 3, 9, 12 HADAMARD 2, sumsub, 2, 8, 12, 11 HADAMARD 2, sumsub, 3, 9, 12, 11 HADAMARD 1, amax, 0, 4, 12, 11 HADAMARD 1, amax, 1, 5, 12, 4 HADAMARD 1, amax, 2, 8, 12, 4 HADAMARD 1, amax, 3, 9, 12, 4 %endif ; create sa8d sub results paddw m1, m2 paddw m0, m3 paddw m0, m1 SAVE_MM_PERMUTATION ret ;------------------------------------------------------------------------------- ; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t ) ;------------------------------------------------------------------------------- cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize %define temp0 [rsp+0*mmsize] %define temp1 [rsp+1*mmsize] FIX_STRIDES r1, r3 %if vertical==0 mova m7, [hmul_8p] %endif lea r4, [3*r1] lea r5, [3*r3] pxor m10, m10 %if mmsize==32 call pixel_sa8d_satd_8x8_internal SA8D_SATD_ACCUM 0, 1 call pixel_sa8d_satd_8x8_internal SA8D_SATD_ACCUM 1, 0 vextracti128 xm1, m0, 1 vextracti128 xm2, m10, 1 paddw xm0, xm1 paddw xm10, xm2 %else lea r6, [r2+8*SIZEOF_PIXEL] lea r7, [r0+8*SIZEOF_PIXEL] call pixel_sa8d_satd_8x8_internal SA8D_SATD_ACCUM 0, 1 call pixel_sa8d_satd_8x8_internal SA8D_SATD_ACCUM 1, 1 mov r0, r7 mov r2, r6 call pixel_sa8d_satd_8x8_internal SA8D_SATD_ACCUM 1, 1 call pixel_sa8d_satd_8x8_internal SA8D_SATD_ACCUM 1, 0 %endif ; xop already has fast horizontal sums %if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0 pmaddwd xm10, [pw_1] HADDUWD xm0, xm1 phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2 pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1 paddd xm0, xm1 ; sa8d sa8d satd satd movd r0d, xm0 pextrd eax, xm0, 2 %else %if HIGH_BIT_DEPTH HADDD xm0, xm1 HADDD xm10, xm2 %else HADDUW xm0, xm1 HADDW xm10, xm2 %endif movd r0d, xm0 movd eax, xm10 %endif add r0d, 1 shl rax, 32 shr r0d, 1 or rax, r0 RET %endmacro ; SA8D_SATD ;============================================================================= ; INTRA SATD ;============================================================================= %macro HSUMSUB2 8 pshufd %4, %2, %7 pshufd %5, %3, %7 %1 %2, %8 %1 %6, %8 paddw %2, %4 paddw %3, %5 %endmacro ; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+, ; and are only retained for old cpus. 
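;
; Note on the x3 helpers: each intra_*_x3 function scores the three whole-block
; prediction modes (V, H and DC) against fenc in a single pass and writes the
; three costs to res[]. The ordering follows the per-mode stores in each
; function (V, H, DC for the luma sizes; DC, H, V for 8x8 chroma), and the _x9
; variants evaluate all nine 4x4/8x8 prediction modes, which is why they
; supersede these on ssse3+. Illustrative caller-side usage only (names are
; assumptions):
;
;   int res[3];
;   intra_sa8d_x3_8x8( fenc, edge, res ); /* res[] = { cost_V, cost_H, cost_DC } */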
%macro INTRA_SA8D_SSE2 0 %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res ) ;----------------------------------------------------------------------------- cglobal intra_sa8d_x3_8x8, 3,3,13 ; 8x8 hadamard pxor m8, m8 movq m0, [r0+0*FENC_STRIDE] movq m1, [r0+1*FENC_STRIDE] movq m2, [r0+2*FENC_STRIDE] movq m3, [r0+3*FENC_STRIDE] movq m4, [r0+4*FENC_STRIDE] movq m5, [r0+5*FENC_STRIDE] movq m6, [r0+6*FENC_STRIDE] movq m7, [r0+7*FENC_STRIDE] punpcklbw m0, m8 punpcklbw m1, m8 punpcklbw m2, m8 punpcklbw m3, m8 punpcklbw m4, m8 punpcklbw m5, m8 punpcklbw m6, m8 punpcklbw m7, m8 HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 ABSW2 m8, m9, m2, m3, m2, m3 ABSW2 m10, m11, m4, m5, m4, m5 paddw m8, m10 paddw m9, m11 ABSW2 m10, m11, m6, m7, m6, m7 ABSW m12, m1, m1 paddw m10, m11 paddw m8, m9 paddw m12, m10 paddw m12, m8 ; 1D hadamard of edges movq m8, [r1+7] movq m9, [r1+16] pxor m10, m10 punpcklbw m8, m10 punpcklbw m9, m10 HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm] HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm] pshuflw m10, m8, q2301 pshuflw m11, m9, q2301 pshufhw m10, m10, q2301 pshufhw m11, m11, q2301 pmullw m8, [pw_pmpmpmpm] pmullw m11, [pw_pmpmpmpm] paddw m8, m10 paddw m9, m11 ; differences paddw m10, m8, m9 paddw m10, [pw_8] pand m10, [sw_f0] psllw m8, 3 ; left edge psllw m10, 2 ; dc psubw m8, m0 psubw m10, m0 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, m5 punpcklwd m6, m7 ABSW m10, m10, m1 paddw m10, m12 punpckldq m0, m2 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose psllw m9, 3 ; top edge psrldq m2, m10, 2 ; 8x7 sum psubw m0, m9 ; 8x1 sum ABSW2 m8, m0, m8, m0, m1, m3 ; 1x8 sum paddw m8, m12 paddusw m2, m0 ; 3x HADDW mova m7, [pd_f0] pandn m0, m7, m10 psrld m10, 16 pandn m1, m7, m8 psrld m8, 16 pandn m7, m2 psrld m2, 16 paddd m0, m10 paddd m1, m8 paddd m2, m7 pshufd m3, m0, q2301 punpckhdq m4, m2, m1 punpckldq m2, m1 paddd m3, m0 paddd m2, m4 punpckhqdq m0, m2, m3 punpcklqdq m2, m3 paddd m0, [pd_2] paddd m0, m2 psrld m0, 2 mova [r2], m0 RET %endif ; ARCH_X86_64 %endmacro ; INTRA_SA8D_SSE2 ; in: r0 = fenc ; out: m0..m3 = hadamard coefs INIT_MMX cglobal hadamard_load ; not really a global, but otherwise cycles get attributed to the wrong function in profiling %if HIGH_BIT_DEPTH mova m0, [r0+0*FENC_STRIDEB] mova m1, [r0+1*FENC_STRIDEB] mova m2, [r0+2*FENC_STRIDEB] mova m3, [r0+3*FENC_STRIDEB] %else pxor m7, m7 movd m0, [r0+0*FENC_STRIDE] movd m1, [r0+1*FENC_STRIDE] movd m2, [r0+2*FENC_STRIDE] movd m3, [r0+3*FENC_STRIDE] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 %endif HADAMARD4_2D 0, 1, 2, 3, 4 SAVE_MM_PERMUTATION ret %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp %ifidn %1, top %if HIGH_BIT_DEPTH mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] %else movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB] pxor %5, %5 punpcklbw %3, %5 %endif %else ; left %ifnidn %2, 0 shl %2d, 5 ; log(FDEC_STRIDEB) %endif movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB] pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2 pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3 %if HIGH_BIT_DEPTH == 0 psrlw %3, 8 %endif %ifnidn %2, 0 shr %2d, 5 %endif %endif ; direction %if cpuflag(ssse3) %define %%sign psignw %else %define %%sign pmullw %endif pshufw %4, %3, q1032 %%sign %4, [pw_ppmmppmm] paddw %3, %4 pshufw %4, %3, q2301 %%sign %4, [pw_pmpmpmpm] paddw %3, %4 psllw %3, 2 mova [%1_1d+2*%2], %3 %endmacro %macro SUM_MM_X3 
8 ; 3x sum, 4x tmp, op pxor %7, %7 pshufw %4, %1, q1032 pshufw %5, %2, q1032 pshufw %6, %3, q1032 paddw %1, %4 paddw %2, %5 paddw %3, %6 punpcklwd %1, %7 punpcklwd %2, %7 punpcklwd %3, %7 pshufw %4, %1, q1032 pshufw %5, %2, q1032 pshufw %6, %3, q1032 %8 %1, %4 %8 %2, %5 %8 %3, %6 %endmacro ; in: m1..m3 ; out: m7 ; clobber: m4..m6 %macro SUM3x4 0 ABSW2 m4, m5, m1, m2, m1, m2 ABSW m7, m3, m3 paddw m4, m5 paddw m7, m4 %endmacro ; in: m0..m3 (4x4) ; out: m0 v, m4 h, m5 dc ; clobber: m1..m3 %macro SUM4x3 3 ; dc, left, top movq m4, %2 %ifnum sizeof%1 movq m5, %1 %else movd m5, %1 %endif psubw m4, m0 psubw m5, m0 punpcklwd m0, m1 punpcklwd m2, m3 punpckldq m0, m2 ; transpose psubw m0, %3 ABSW2 m4, m5, m4, m5, m2, m3 ; 1x4 sum ABSW m0, m0, m1 ; 4x1 sum %endmacro %macro INTRA_X3_MMX 0 ;----------------------------------------------------------------------------- ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_4x4, 3,3 %if UNIX64 ; stack is 16 byte aligned because abi says so %define top_1d rsp-8 ; size 8 %define left_1d rsp-16 ; size 8 %else ; WIN64: stack is 16 byte aligned because abi says so ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned SUB rsp, 16 %define top_1d rsp+8 %define left_1d rsp %endif call hadamard_load SCALAR_HADAMARD left, 0, m4, m5 SCALAR_HADAMARD top, 0, m6, m5, m7 paddw m6, m4 pavgw m6, [pw_16] pand m6, [sw_f0] ; dc SUM3x4 SUM4x3 m6, [left_1d], [top_1d] paddw m4, m7 paddw m5, m7 movq m1, m5 psrlq m1, 16 ; 4x3 sum paddw m0, m1 SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw movd [r2+0], m0 ; i4x4_v satd movd [r2+4], m4 ; i4x4_h satd movd [r2+8], m5 ; i4x4_dc satd %if UNIX64 == 0 ADD rsp, 16 %endif RET ;----------------------------------------------------------------------------- ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_16x16, 0,5 %assign stack_pad 120 + ((stack_offset+120+gprsize)&15) ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, stack_pad %define sums rsp+64 ; size 56 %define top_1d rsp+32 ; size 32 %define left_1d rsp ; size 32 movifnidn r1, r1mp pxor m7, m7 mova [sums+ 0], m7 mova [sums+ 8], m7 mova [sums+16], m7 %if HIGH_BIT_DEPTH mova [sums+24], m7 mova [sums+32], m7 mova [sums+40], m7 mova [sums+48], m7 %endif ; 1D hadamards mov r3d, 12 movd m6, [pw_32] .loop_edge: SCALAR_HADAMARD left, r3, m0, m1 SCALAR_HADAMARD top, r3, m1, m2, m3 pavgw m0, m1 paddw m6, m0 sub r3d, 4 jge .loop_edge psrlw m6, 2 pand m6, [sw_f0] ; dc ; 2D hadamards movifnidn r0, r0mp mov r3, -4 .loop_y: mov r4, -4 .loop_x: call hadamard_load SUM3x4 SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)] pavgw m4, m7 pavgw m5, m7 paddw m0, [sums+ 0] ; i16x16_v satd paddw m4, [sums+ 8] ; i16x16_h satd paddw m5, [sums+16] ; i16x16_dc satd mova [sums+ 0], m0 mova [sums+ 8], m4 mova [sums+16], m5 add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x %if HIGH_BIT_DEPTH psrld m7, m4, 16 pslld m4, 16 psrld m4, 16 paddd m4, m7 psrld m7, m0, 16 pslld m0, 16 psrld m0, 16 paddd m0, m7 paddd m4, [sums+32] paddd m0, [sums+24] mova [sums+32], m4 mova [sums+24], m0 pxor m7, m7 punpckhwd m3, m5, m7 punpcklwd m5, m7 paddd m3, [sums+48] paddd m5, [sums+40] mova [sums+48], m3 mova [sums+40], m5 mova [sums+ 0], m7 mova [sums+ 8], m7 mova [sums+16], m7 %endif add r0, 
4*FENC_STRIDEB-16*SIZEOF_PIXEL inc r3 jl .loop_y ; horizontal sum movifnidn r2, r2mp %if HIGH_BIT_DEPTH mova m1, m5 paddd m5, m3 HADDD m5, m7 ; DC satd HADDD m4, m7 ; H satd HADDD m0, m7 ; the part of V satd that doesn't overlap with DC psrld m0, 1 psrlq m1, 32 ; DC[1] paddd m0, m3 ; DC[2] psrlq m3, 32 ; DC[3] paddd m0, m1 paddd m0, m3 %else mova m7, m5 SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd psrld m0, 1 pslld m7, 16 psrld m7, 16 paddd m0, m5 psubd m0, m7 %endif movd [r2+8], m5 ; i16x16_dc satd movd [r2+4], m4 ; i16x16_h satd movd [r2+0], m0 ; i16x16_v satd ADD rsp, stack_pad RET %if ARCH_X86_64 %define t0 r6 %else %define t0 r2 %endif ;----------------------------------------------------------------------------- ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res ) ;----------------------------------------------------------------------------- cglobal intra_satd_x3_8x8c, 0,6 ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call SUB rsp, 72 %define sums rsp+48 ; size 24 %define dc_1d rsp+32 ; size 16 %define top_1d rsp+16 ; size 16 %define left_1d rsp ; size 16 movifnidn r1, r1mp pxor m7, m7 mova [sums+ 0], m7 mova [sums+ 8], m7 mova [sums+16], m7 ; 1D hadamards mov r3d, 4 .loop_edge: SCALAR_HADAMARD left, r3, m0, m1 SCALAR_HADAMARD top, r3, m0, m1, m2 sub r3d, 4 jge .loop_edge ; dc movzx t0d, word [left_1d+0] movzx r3d, word [top_1d+0] movzx r4d, word [left_1d+8] movzx r5d, word [top_1d+8] lea t0d, [t0 + r3 + 16] lea r3d, [r4 + r5 + 16] shr t0d, 1 shr r3d, 1 add r4d, 8 add r5d, 8 and t0d, -16 ; tl and r3d, -16 ; br and r4d, -16 ; bl and r5d, -16 ; tr mov [dc_1d+ 0], t0d ; tl mov [dc_1d+ 4], r5d ; tr mov [dc_1d+ 8], r4d ; bl mov [dc_1d+12], r3d ; br lea r5, [dc_1d] ; 2D hadamards movifnidn r0, r0mp movifnidn r2, r2mp mov r3, -2 .loop_y: mov r4, -2 .loop_x: call hadamard_load SUM3x4 SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)] pavgw m4, m7 pavgw m5, m7 paddw m0, [sums+16] ; i4x4_v satd paddw m4, [sums+8] ; i4x4_h satd paddw m5, [sums+0] ; i4x4_dc satd movq [sums+16], m0 movq [sums+8], m4 movq [sums+0], m5 add r0, 4*SIZEOF_PIXEL inc r4 jl .loop_x add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL add r5, 8 inc r3 jl .loop_y ; horizontal sum movq m0, [sums+0] movq m1, [sums+8] movq m2, [sums+16] movq m7, m0 %if HIGH_BIT_DEPTH psrlq m7, 16 HADDW m7, m3 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 paddd m2, m7 %else psrlq m7, 15 paddw m2, m7 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd psrld m2, 1 %endif movd [r2+0], m0 ; i8x8c_dc satd movd [r2+4], m1 ; i8x8c_h satd movd [r2+8], m2 ; i8x8c_v satd ADD rsp, 72 RET %endmacro ; INTRA_X3_MMX %macro PRED4x4_LOWPASS 5 %ifnum sizeof%5 pavgb %5, %2, %3 pxor %3, %2 pand %3, [pb_1] psubusb %5, %3 pavgb %1, %4, %5 %else mova %5, %2 pavgb %2, %3 pxor %3, %5 pand %3, [pb_1] psubusb %2, %3 pavgb %1, %4, %2 %endif %endmacro %macro INTRA_X9_PRED 2 %if cpuflag(sse4) movu m1, [r1-1*FDEC_STRIDE-8] pinsrb m1, [r1+3*FDEC_STRIDE-1], 0 pinsrb m1, [r1+2*FDEC_STRIDE-1], 1 pinsrb m1, [r1+1*FDEC_STRIDE-1], 2 pinsrb m1, [r1+0*FDEC_STRIDE-1], 3 %else movd mm0, [r1+3*FDEC_STRIDE-4] punpcklbw mm0, [r1+2*FDEC_STRIDE-4] movd mm1, [r1+1*FDEC_STRIDE-4] punpcklbw mm1, [r1+0*FDEC_STRIDE-4] punpckhwd mm0, mm1 psrlq mm0, 32 movq2dq m0, mm0 movu m1, [r1-1*FDEC_STRIDE-8] movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7 %endif ; cpuflag pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ psrldq m2, 
m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __ pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __ mova %2, m1 PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __ ; ddl ddr ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2 ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1 ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0 ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1 pshufb m3, m0, [%1_ddlr2] ; rows 2,3 ; hd hu ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2 ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3 ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3 ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3 pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt pshufb m6, m7, [%1_hdu1] pshufb m7, m7, [%1_hdu2] ; vr vl ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4 ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4 ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5 ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ... palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 pshufb m4, m5, [%1_vrl1] pshufb m5, m5, [%1_vrl2] %endmacro ; INTRA_X9_PRED %macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp pshufb m2, m%1, [intrax9b_vh1] pshufb m3, m%1, [intrax9b_vh2] mova [pred_buf+0x60], m2 mova [pred_buf+0x70], m3 pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3 pmaddubsw m%1, [hmul_4p] pshufhw m0, m%1, q2301 pshuflw m0, m0, q2301 psignw m%1, [pw_pmpmpmpm] paddw m0, m%1 psllw m0, 2 ; hadamard(top), hadamard(left) MOVHL m3, m0 pshufb m1, m0, [intrax9b_v1] pshufb m2, m0, [intrax9b_v2] paddw m0, m3 psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated? pavgw m0, [pw_16] pand m0, [sw_f0] ; dc ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs. ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef. HADAMARD 0, sumsub, %2, %3, %4, %5 HADAMARD 1, sumsub, %2, %3, %4, %5 movd r3d, m0 shr r3d, 4 imul r3d, 0x01010101 mov [pred_buf+0x80], r3d mov [pred_buf+0x88], r3d mov [pred_buf+0x90], r3d mov [pred_buf+0x98], r3d psubw m3, m%2 psubw m0, m%2 psubw m1, m%2 psubw m2, m%3 pabsw m%3, m%3 pabsw m3, m3 pabsw m0, m0 pabsw m1, m1 pabsw m2, m2 pavgw m3, m%3 pavgw m0, m%3 pavgw m1, m2 %if cpuflag(sse4) phaddw m3, m0 %else SBUTTERFLY qdq, 3, 0, 2 paddw m3, m0 %endif MOVHL m2, m1 paddw m1, m2 %if cpuflag(xop) vphaddwq m3, m3 vphaddwq m1, m1 packssdw m1, m3 %else phaddw m1, m3 pmaddwd m1, [pw_1] ; v, _, h, dc %endif %endmacro ; INTRA_X9_VHDC %macro INTRA_X9_END 2 %if cpuflag(sse4) phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu movd eax, m0 add eax, 1<<16 cmp ax, r3w cmovge eax, r3d %else %if %1 ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index psllw m0, 3 paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu %else ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index psllw m0, 2 paddusw m0, m0 paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu %endif movhlps m1, m0 pminsw m0, m1 pshuflw m1, m0, q0032 pminsw m0, m1 pshuflw m1, m0, q0001 pminsw m0, m1 movd eax, m0 movsx r2d, ax and eax, 7 sar r2d, 3 shl eax, 16 ; 1<<16: increment index to match intra4x4_pred_e. 
couldn't do this before because it had to fit in 3 bits ; 1<<12: undo sign manipulation lea eax, [rax+r2+(1<<16)+(1<<12)] cmp ax, r3w cmovge eax, r3d %endif ; cpuflag ; output the predicted samples mov r3d, eax shr r3d, 16 %if ARCH_X86_64 lea r2, [%2_lut] movzx r2d, byte [r2+r3] %else movzx r2d, byte [%2_lut+r3] %endif %if %1 ; sad movq mm0, [pred_buf+r2] movq mm1, [pred_buf+r2+16] movd [r1+0*FDEC_STRIDE], mm0 movd [r1+2*FDEC_STRIDE], mm1 psrlq mm0, 32 psrlq mm1, 32 movd [r1+1*FDEC_STRIDE], mm0 movd [r1+3*FDEC_STRIDE], mm1 %else ; satd %assign i 0 %rep 4 mov r3d, [pred_buf+r2+8*i] mov [r1+i*FDEC_STRIDE], r3d %assign i i+1 %endrep %endif %endmacro ; INTRA_X9_END %macro INTRA_X9 0 ;----------------------------------------------------------------------------- ; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) ;----------------------------------------------------------------------------- %if notcpuflag(xop) cglobal intra_sad_x9_4x4, 3,4,9 %assign pad 0xc0-gprsize-(stack_offset&15) %define pred_buf rsp sub rsp, pad %if ARCH_X86_64 INTRA_X9_PRED intrax9a, m8 %else INTRA_X9_PRED intrax9a, [rsp+0xa0] %endif mova [rsp+0x00], m2 mova [rsp+0x10], m3 mova [rsp+0x20], m4 mova [rsp+0x30], m5 mova [rsp+0x40], m6 mova [rsp+0x50], m7 %if cpuflag(sse4) movd m0, [r0+0*FENC_STRIDE] pinsrd m0, [r0+1*FENC_STRIDE], 1 movd m1, [r0+2*FENC_STRIDE] pinsrd m1, [r0+3*FENC_STRIDE], 1 %else movd mm0, [r0+0*FENC_STRIDE] punpckldq mm0, [r0+1*FENC_STRIDE] movd mm1, [r0+2*FENC_STRIDE] punpckldq mm1, [r0+3*FENC_STRIDE] movq2dq m0, mm0 movq2dq m1, mm1 %endif punpcklqdq m0, m0 punpcklqdq m1, m1 psadbw m2, m0 psadbw m3, m1 psadbw m4, m0 psadbw m5, m1 psadbw m6, m0 psadbw m7, m1 paddd m2, m3 paddd m4, m5 paddd m6, m7 %if ARCH_X86_64 SWAP 7, 8 pxor m8, m8 %define %%zero m8 %else mova m7, [rsp+0xa0] %define %%zero [pb_0] %endif pshufb m3, m7, [intrax9a_vh1] pshufb m5, m7, [intrax9a_vh2] pshufb m7, [intrax9a_dc] psadbw m7, %%zero psrlw m7, 2 mova [rsp+0x60], m3 mova [rsp+0x70], m5 psadbw m3, m0 pavgw m7, %%zero pshufb m7, %%zero psadbw m5, m1 movq [rsp+0x80], m7 movq [rsp+0x90], m7 psadbw m0, m7 paddd m3, m5 psadbw m1, m7 paddd m0, m1 movzx r3d, word [r2] movd r0d, m3 ; v add r3d, r0d punpckhqdq m3, m0 ; h, dc shufps m3, m2, q2020 psllq m6, 32 por m4, m6 movu m0, [r2+2] packssdw m3, m4 paddw m0, m3 INTRA_X9_END 1, intrax9a add rsp, pad RET %endif ; cpuflag %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts ) ;----------------------------------------------------------------------------- cglobal intra_satd_x9_4x4, 3,4,16 %assign pad 0xb0-gprsize-(stack_offset&15) %define pred_buf rsp sub rsp, pad INTRA_X9_PRED intrax9b, m15 mova [rsp+0x00], m2 mova [rsp+0x10], m3 mova [rsp+0x20], m4 mova [rsp+0x30], m5 mova [rsp+0x40], m6 mova [rsp+0x50], m7 movd m8, [r0+0*FENC_STRIDE] movd m9, [r0+1*FENC_STRIDE] movd m10, [r0+2*FENC_STRIDE] movd m11, [r0+3*FENC_STRIDE] mova m12, [hmul_8p] pshufd m8, m8, 0 pshufd m9, m9, 0 pshufd m10, m10, 0 pshufd m11, m11, 0 pmaddubsw m8, m12 pmaddubsw m9, m12 pmaddubsw m10, m12 pmaddubsw m11, m12 movddup m0, m2 pshufd m1, m2, q3232 movddup m2, m3 punpckhqdq m3, m3 call .satd_8x4 ; ddr, ddl movddup m2, m5 pshufd m3, m5, q3232 mova m5, m0 movddup m0, m4 pshufd m1, m4, q3232 call .satd_8x4 ; vr, vl movddup m2, m7 pshufd m3, m7, q3232 mova m4, m0 movddup m0, m6 pshufd m1, m6, q3232 call .satd_8x4 ; hd, hu %if cpuflag(sse4) punpckldq m4, m0 %else punpcklqdq m4, m0 ; conroe 
dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't %endif mova m1, [pw_ppmmppmm] psignw m8, m1 psignw m10, m1 paddw m8, m9 paddw m10, m11 INTRA_X9_VHDC 15, 8, 10, 6, 7 ; find minimum movu m0, [r2+2] movd r3d, m1 palignr m5, m1, 8 %if notcpuflag(sse4) pshufhw m0, m0, q3120 ; compensate for different order in unpack %endif packssdw m5, m4 paddw m0, m5 movzx r0d, word [r2] add r3d, r0d INTRA_X9_END 0, intrax9b add rsp, pad RET RESET_MM_PERMUTATION ALIGN 16 .satd_8x4: pmaddubsw m0, m12 pmaddubsw m1, m12 pmaddubsw m2, m12 pmaddubsw m3, m12 psubw m0, m8 psubw m1, m9 psubw m2, m10 psubw m3, m11 SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap pmaddwd m0, [pw_1] MOVHL m1, m0 paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free ret %else ; !ARCH_X86_64 cglobal intra_satd_x9_4x4, 3,4,8 %assign pad 0x120-gprsize-(stack_offset&15) %define fenc_buf rsp %define pred_buf rsp+0x40 %define spill rsp+0xe0 sub rsp, pad INTRA_X9_PRED intrax9b, [spill+0x20] mova [pred_buf+0x00], m2 mova [pred_buf+0x10], m3 mova [pred_buf+0x20], m4 mova [pred_buf+0x30], m5 mova [pred_buf+0x40], m6 mova [pred_buf+0x50], m7 movd m4, [r0+0*FENC_STRIDE] movd m5, [r0+1*FENC_STRIDE] movd m6, [r0+2*FENC_STRIDE] movd m0, [r0+3*FENC_STRIDE] mova m7, [hmul_8p] pshufd m4, m4, 0 pshufd m5, m5, 0 pshufd m6, m6, 0 pshufd m0, m0, 0 pmaddubsw m4, m7 pmaddubsw m5, m7 pmaddubsw m6, m7 pmaddubsw m0, m7 mova [fenc_buf+0x00], m4 mova [fenc_buf+0x10], m5 mova [fenc_buf+0x20], m6 mova [fenc_buf+0x30], m0 movddup m0, m2 pshufd m1, m2, q3232 movddup m2, m3 punpckhqdq m3, m3 pmaddubsw m0, m7 pmaddubsw m1, m7 pmaddubsw m2, m7 pmaddubsw m3, m7 psubw m0, m4 psubw m1, m5 psubw m2, m6 call .satd_8x4b ; ddr, ddl mova m3, [pred_buf+0x30] mova m1, [pred_buf+0x20] movddup m2, m3 punpckhqdq m3, m3 movq [spill+0x08], m0 movddup m0, m1 punpckhqdq m1, m1 call .satd_8x4 ; vr, vl mova m3, [pred_buf+0x50] mova m1, [pred_buf+0x40] movddup m2, m3 punpckhqdq m3, m3 movq [spill+0x10], m0 movddup m0, m1 punpckhqdq m1, m1 call .satd_8x4 ; hd, hu movq [spill+0x18], m0 mova m1, [spill+0x20] mova m4, [fenc_buf+0x00] mova m5, [fenc_buf+0x20] mova m2, [pw_ppmmppmm] psignw m4, m2 psignw m5, m2 paddw m4, [fenc_buf+0x10] paddw m5, [fenc_buf+0x30] INTRA_X9_VHDC 1, 4, 5, 6, 7 ; find minimum movu m0, [r2+2] movd r3d, m1 punpckhqdq m1, [spill+0x00] packssdw m1, [spill+0x10] %if cpuflag(sse4) pshufhw m1, m1, q3120 %else pshufhw m0, m0, q3120 %endif paddw m0, m1 movzx r0d, word [r2] add r3d, r0d INTRA_X9_END 0, intrax9b add rsp, pad RET RESET_MM_PERMUTATION ALIGN 16 .satd_8x4: pmaddubsw m0, m7 pmaddubsw m1, m7 pmaddubsw m2, m7 pmaddubsw m3, m7 %xdefine fenc_buf fenc_buf+gprsize psubw m0, [fenc_buf+0x00] psubw m1, [fenc_buf+0x10] psubw m2, [fenc_buf+0x20] .satd_8x4b: psubw m3, [fenc_buf+0x30] SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap pmaddwd m0, [pw_1] MOVHL m1, m0 paddd xmm0, m0, m1 ret %endif ; ARCH %endmacro ; INTRA_X9 %macro INTRA8_X9 0 ;----------------------------------------------------------------------------- ; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) ;----------------------------------------------------------------------------- cglobal intra_sad_x9_8x8, 5,6,9 %define fenc02 m4 %define fenc13 m5 %define fenc46 m6 %define fenc57 m7 %if ARCH_X86_64 %define tmp m8 %assign padbase 0x0 %else %define tmp [rsp] %assign padbase 0x10 %endif %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15) %define pred(i,j) 
[rsp+i*0x40+j*0x10+padbase] SUB rsp, pad movq fenc02, [r0+FENC_STRIDE* 0] movq fenc13, [r0+FENC_STRIDE* 1] movq fenc46, [r0+FENC_STRIDE* 4] movq fenc57, [r0+FENC_STRIDE* 5] movhps fenc02, [r0+FENC_STRIDE* 2] movhps fenc13, [r0+FENC_STRIDE* 3] movhps fenc46, [r0+FENC_STRIDE* 6] movhps fenc57, [r0+FENC_STRIDE* 7] ; save instruction size: avoid 4-byte memory offsets lea r0, [intra8x9_h1+128] %define off(m) (r0+m-(intra8x9_h1+128)) ; v movddup m0, [r2+16] mova pred(0,0), m0 psadbw m1, m0, fenc02 mova pred(0,1), m0 psadbw m2, m0, fenc13 mova pred(0,2), m0 psadbw m3, m0, fenc46 mova pred(0,3), m0 psadbw m0, m0, fenc57 paddw m1, m2 paddw m0, m3 paddw m0, m1 MOVHL m1, m0 paddw m0, m1 movd [r4+0], m0 ; h movq m0, [r2+7] pshufb m1, m0, [off(intra8x9_h1)] pshufb m2, m0, [off(intra8x9_h2)] mova pred(1,0), m1 psadbw m1, fenc02 mova pred(1,1), m2 psadbw m2, fenc13 paddw m1, m2 pshufb m3, m0, [off(intra8x9_h3)] pshufb m2, m0, [off(intra8x9_h4)] mova pred(1,2), m3 psadbw m3, fenc46 mova pred(1,3), m2 psadbw m2, fenc57 paddw m1, m3 paddw m1, m2 MOVHL m2, m1 paddw m1, m2 movd [r4+2], m1 lea r5, [rsp+padbase+0x100] %define pred(i,j) [r5+i*0x40+j*0x10-0x100] ; dc movhps m0, [r2+16] pxor m2, m2 psadbw m0, m2 MOVHL m1, m0 paddw m0, m1 psrlw m0, 3 pavgw m0, m2 pshufb m0, m2 mova pred(2,0), m0 psadbw m1, m0, fenc02 mova pred(2,1), m0 psadbw m2, m0, fenc13 mova pred(2,2), m0 psadbw m3, m0, fenc46 mova pred(2,3), m0 psadbw m0, m0, fenc57 paddw m1, m2 paddw m0, m3 paddw m0, m1 MOVHL m1, m0 paddw m0, m1 movd [r4+4], m0 ; ddl ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB ; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC ; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD ; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE ; Ft8 Ft9 FtA FtB FtC FtD FtE FtF mova m0, [r2+16] movu m2, [r2+17] pslldq m1, m0, 1 pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___ PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF pshufb m1, m0, [off(intra8x9_ddl1)] pshufb m2, m0, [off(intra8x9_ddl2)] mova pred(3,0), m1 psadbw m1, fenc02 mova pred(3,1), m2 psadbw m2, fenc13 paddw m1, m2 pshufb m2, m0, [off(intra8x9_ddl3)] mova pred(3,2), m2 psadbw m2, fenc46 paddw m1, m2 pshufb m2, m0, [off(intra8x9_ddl4)] mova pred(3,3), m2 psadbw m2, fenc57 paddw m1, m2 MOVHL m2, m1 paddw m1, m2 movd [r4+6], m1 ; vl ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 ; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 ; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA ; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB pshufb m1, m3, [off(intra8x9_vl1)] pshufb m2, m0, [off(intra8x9_vl2)] pshufb m3, m3, [off(intra8x9_vl3)] pshufb m0, m0, [off(intra8x9_vl4)] mova pred(7,0), m1 psadbw m1, fenc02 mova pred(7,1), m2 psadbw m2, fenc13 mova pred(7,2), m3 psadbw m3, fenc46 mova pred(7,3), m0 psadbw m0, fenc57 paddw m1, m2 paddw m0, m3 paddw m0, m1 MOVHL m1, m0 paddw m0, m1 %if cpuflag(sse4) pextrw [r4+14], m0, 0 %else movd r5d, m0 mov [r4+14], r5w lea r5, [rsp+padbase+0x100] %endif ; ddr ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 ; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 ; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 ; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 ; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 ; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt movu m2, [r2+8] movu m0, [r2+7] movu m1, [r2+6] pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 
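; Notation used by the Fxx/Gxx comments in these predictors, sketched on a
; generic edge array e[] rather than the exact register layout:
;   G[i] = ( e[i] + e[i+1] + 1 ) >> 1              ; pavgb of the edge with itself shifted by one
;   F[i] = ( e[i-1] + 2*e[i] + e[i+1] + 2 ) >> 2   ; PRED4x4_LOWPASS, the 3-tap low-pass filter
; i.e. G is the 2-tap rounded average and F the 3-tap low-pass of the edge;
; the directional predictions are assembled by shuffling rows of F and G.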
PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 pshufb m1, m0, [off(intra8x9_ddr1)] pshufb m2, m0, [off(intra8x9_ddr2)] mova pred(4,0), m1 psadbw m1, fenc02 mova pred(4,1), m2 psadbw m2, fenc13 paddw m1, m2 pshufb m2, m0, [off(intra8x9_ddr3)] mova pred(4,2), m2 psadbw m2, fenc46 paddw m1, m2 pshufb m2, m0, [off(intra8x9_ddr4)] mova pred(4,3), m2 psadbw m2, fenc57 paddw m1, m2 MOVHL m2, m1 paddw m1, m2 movd [r4+8], m1 add r0, 256 add r5, 0xC0 %define off(m) (r0+m-(intra8x9_h1+256+128)) %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0] ; vr ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 ; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 ; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 ; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 pshufb m1, m2, [off(intra8x9_vr1)] pshufb m2, m2, [off(intra8x9_vr3)] mova pred(5,0), m1 psadbw m1, fenc02 mova pred(5,2), m2 psadbw m2, fenc46 paddw m1, m2 pshufb m2, m0, [off(intra8x9_vr2)] mova pred(5,1), m2 psadbw m2, fenc13 paddw m1, m2 pshufb m2, m0, [off(intra8x9_vr4)] mova pred(5,3), m2 psadbw m2, fenc57 paddw m1, m2 MOVHL m2, m1 paddw m1, m2 movd [r4+10], m1 ; hd ; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3 ; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1 ; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt ; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 ; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 ; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 pshufd m2, m3, q0001 %if cpuflag(sse4) pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___ %else movss m1, m0, m2 SWAP 1, 2 %endif punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___ pshufb m1, m2, [off(intra8x9_hd1)] pshufb m2, m2, [off(intra8x9_hd2)] mova pred(6,0), m1 psadbw m1, fenc02 mova pred(6,1), m2 psadbw m2, fenc13 paddw m1, m2 pshufb m2, m0, [off(intra8x9_hd3)] pshufb m3, m0, [off(intra8x9_hd4)] mova pred(6,2), m2 psadbw m2, fenc46 mova pred(6,3), m3 psadbw m3, fenc57 paddw m1, m2 paddw m1, m3 MOVHL m2, m1 paddw m1, m2 ; don't just store to [r4+12]. 
this is too close to the load of dqword [r4] and would cause a forwarding stall pslldq m1, 12 SWAP 3, 1 ; hu ; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 ; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 ; Gl2 Fl3 Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 ; Gl3 Gl3 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 ; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 ; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 ; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 ; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 %if cpuflag(sse4) pinsrb m0, [r2+7], 15 ; Gl7 %else movd m1, [r2+7] pslldq m0, 1 palignr m1, m0, 1 SWAP 0, 1 %endif pshufb m1, m0, [off(intra8x9_hu1)] pshufb m2, m0, [off(intra8x9_hu2)] mova pred(8,0), m1 psadbw m1, fenc02 mova pred(8,1), m2 psadbw m2, fenc13 paddw m1, m2 pshufb m2, m0, [off(intra8x9_hu3)] pshufb m0, m0, [off(intra8x9_hu4)] mova pred(8,2), m2 psadbw m2, fenc46 mova pred(8,3), m0 psadbw m0, fenc57 paddw m1, m2 paddw m1, m0 MOVHL m2, m1 paddw m1, m2 movd r2d, m1 movu m0, [r3] por m3, [r4] paddw m0, m3 mova [r4], m0 movzx r5d, word [r3+16] add r2d, r5d mov [r4+16], r2w %if cpuflag(sse4) phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl movd eax, m0 %else ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index paddusw m0, m0 paddusw m0, m0 paddw m0, [off(pw_s00112233)] MOVHL m1, m0 pminsw m0, m1 pshuflw m1, m0, q0032 pminsw m0, m1 movd eax, m0 ; repack with 3 bit index xor eax, 0x80008000 movzx r3d, ax shr eax, 15 add r3d, r3d or eax, 1 cmp eax, r3d cmovg eax, r3d ; reverse to phminposuw order mov r3d, eax and eax, 7 shr r3d, 3 shl eax, 16 or eax, r3d %endif add r2d, 8<<16 cmp ax, r2w cmovg eax, r2d mov r2d, eax shr r2d, 16 shl r2d, 6 add r1, 4*FDEC_STRIDE mova m0, [rsp+padbase+r2+0x00] mova m1, [rsp+padbase+r2+0x10] mova m2, [rsp+padbase+r2+0x20] mova m3, [rsp+padbase+r2+0x30] movq [r1+FDEC_STRIDE*-4], m0 movhps [r1+FDEC_STRIDE*-2], m0 movq [r1+FDEC_STRIDE*-3], m1 movhps [r1+FDEC_STRIDE*-1], m1 movq [r1+FDEC_STRIDE* 0], m2 movhps [r1+FDEC_STRIDE* 2], m2 movq [r1+FDEC_STRIDE* 1], m3 movhps [r1+FDEC_STRIDE* 3], m3 ADD rsp, pad RET %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds ) ;----------------------------------------------------------------------------- cglobal intra_sa8d_x9_8x8, 5,6,16 %assign pad 0x2c0+0x10-gprsize-(stack_offset&15) %define fenc_buf rsp %define pred_buf rsp+0x80 SUB rsp, pad mova m15, [hmul_8p] pxor m8, m8 %assign %%i 0 %rep 8 movddup m %+ %%i, [r0+%%i*FENC_STRIDE] pmaddubsw m9, m %+ %%i, m15 punpcklbw m %+ %%i, m8 mova [fenc_buf+%%i*0x10], m9 %assign %%i %%i+1 %endrep ; save instruction size: avoid 4-byte memory offsets lea r0, [intra8x9_h1+0x80] %define off(m) (r0+m-(intra8x9_h1+0x80)) lea r5, [pred_buf+0x80] ; v, h, dc HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8 pabsw m11, m1 %assign %%i 2 %rep 6 pabsw m8, m %+ %%i paddw m11, m8 %assign %%i %%i+1 %endrep ; 1D hadamard of edges movq m8, [r2+7] movddup m9, [r2+16] mova [r5-0x80], m9 mova [r5-0x70], m9 mova [r5-0x60], m9 mova [r5-0x50], m9 punpcklwd m8, m8 pshufb m9, [intrax3_shuf] pmaddubsw m8, [pb_pppm] pmaddubsw m9, [pb_pppm] HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm] HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm] ; dc paddw m10, m8, m9 paddw m10, [pw_8] pand m10, [sw_f0] psrlw m12, m10, 4 psllw m10, 2 pxor m13, m13 pshufb m12, m13 mova [r5+0x00], m12 mova [r5+0x10], m12 mova [r5+0x20], m12 mova [r5+0x30], m12 ; differences psllw m8, 3 ; left edge psubw m8, m0 psubw m10, m0 pabsw m8, m8 ; 1x8 sum pabsw m10, m10 paddw m8, m11 
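; Note on the v/h/dc costs being accumulated here: those three predictions are
; never built and re-transformed. For a prediction that is constant along rows
; (h), constant along columns (v) or fully constant (dc), its 8x8 Hadamard is
; nonzero only in the first column, first row or DC coefficient respectively,
; so (roughly) each cost is the shared sum of absolute source coefficients
; plus a correction of the first row/column against 8x the 1D Hadamard of the
; corresponding edge (or against the dc term). Only the six directional modes
; go through the .sa8d helper below.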
paddw m11, m10 punpcklwd m0, m1 punpcklwd m2, m3 punpcklwd m4, m5 punpcklwd m6, m7 punpckldq m0, m2 punpckldq m4, m6 punpcklqdq m0, m4 ; transpose psllw m9, 3 ; top edge psrldq m10, m11, 2 ; 8x7 sum psubw m0, m9 ; 8x1 sum pabsw m0, m0 paddw m10, m0 phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow psrlw m11, 1 psrlw m10, 1 ; store h movq m3, [r2+7] pshufb m0, m3, [off(intra8x9_h1)] pshufb m1, m3, [off(intra8x9_h2)] pshufb m2, m3, [off(intra8x9_h3)] pshufb m3, m3, [off(intra8x9_h4)] mova [r5-0x40], m0 mova [r5-0x30], m1 mova [r5-0x20], m2 mova [r5-0x10], m3 ; ddl mova m8, [r2+16] movu m2, [r2+17] pslldq m1, m8, 1 pavgb m9, m8, m2 PRED4x4_LOWPASS m8, m1, m2, m8, m3 pshufb m0, m8, [off(intra8x9_ddl1)] pshufb m1, m8, [off(intra8x9_ddl2)] pshufb m2, m8, [off(intra8x9_ddl3)] pshufb m3, m8, [off(intra8x9_ddl4)] add r5, 0x40 call .sa8d phaddd m11, m0 ; vl pshufb m0, m9, [off(intra8x9_vl1)] pshufb m1, m8, [off(intra8x9_vl2)] pshufb m2, m9, [off(intra8x9_vl3)] pshufb m3, m8, [off(intra8x9_vl4)] add r5, 0x100 call .sa8d phaddd m10, m11 mova m12, m0 ; ddr movu m2, [r2+8] movu m8, [r2+7] movu m1, [r2+6] pavgb m9, m2, m8 PRED4x4_LOWPASS m8, m1, m2, m8, m3 pshufb m0, m8, [off(intra8x9_ddr1)] pshufb m1, m8, [off(intra8x9_ddr2)] pshufb m2, m8, [off(intra8x9_ddr3)] pshufb m3, m8, [off(intra8x9_ddr4)] sub r5, 0xc0 call .sa8d mova m11, m0 add r0, 0x100 %define off(m) (r0+m-(intra8x9_h1+0x180)) ; vr movsd m2, m9, m8 pshufb m0, m2, [off(intra8x9_vr1)] pshufb m1, m8, [off(intra8x9_vr2)] pshufb m2, m2, [off(intra8x9_vr3)] pshufb m3, m8, [off(intra8x9_vr4)] add r5, 0x40 call .sa8d phaddd m11, m0 ; hd %if cpuflag(sse4) pshufd m1, m9, q0001 pblendw m1, m8, q3330 %else pshufd m2, m9, q0001 movss m1, m8, m2 %endif punpcklbw m8, m9 pshufb m0, m1, [off(intra8x9_hd1)] pshufb m1, m1, [off(intra8x9_hd2)] pshufb m2, m8, [off(intra8x9_hd3)] pshufb m3, m8, [off(intra8x9_hd4)] add r5, 0x40 call .sa8d phaddd m0, m12 phaddd m11, m0 ; hu %if cpuflag(sse4) pinsrb m8, [r2+7], 15 %else movd m9, [r2+7] pslldq m8, 1 palignr m9, m8, 1 SWAP 8, 9 %endif pshufb m0, m8, [off(intra8x9_hu1)] pshufb m1, m8, [off(intra8x9_hu2)] pshufb m2, m8, [off(intra8x9_hu3)] pshufb m3, m8, [off(intra8x9_hu4)] add r5, 0x80 call .sa8d pmaddwd m0, [pw_1] phaddw m10, m11 MOVHL m1, m0 paddw m0, m1 pshuflw m1, m0, q0032 pavgw m0, m1 pxor m2, m2 pavgw m10, m2 movd r2d, m0 movu m0, [r3] paddw m0, m10 mova [r4], m0 movzx r5d, word [r3+16] add r2d, r5d mov [r4+16], r2w %if cpuflag(sse4) phminposuw m0, m0 movd eax, m0 %else ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index paddusw m0, m0 paddw m0, [off(pw_s00001111)] MOVHL m1, m0 pminsw m0, m1 pshuflw m1, m0, q0032 mova m2, m0 pminsw m0, m1 pcmpgtw m2, m1 ; 2nd index bit movd r3d, m0 movd r4d, m2 ; repack with 3 bit index xor r3d, 0x80008000 and r4d, 0x00020002 movzx eax, r3w movzx r5d, r4w shr r3d, 16 shr r4d, 16 lea eax, [rax*4+r5] lea r3d, [ r3*4+r4+1] cmp eax, r3d cmovg eax, r3d ; reverse to phminposuw order mov r3d, eax and eax, 7 shr r3d, 3 shl eax, 16 or eax, r3d %endif add r2d, 8<<16 cmp ax, r2w cmovg eax, r2d mov r2d, eax shr r2d, 16 shl r2d, 6 add r1, 4*FDEC_STRIDE mova m0, [pred_buf+r2+0x00] mova m1, [pred_buf+r2+0x10] mova m2, [pred_buf+r2+0x20] mova m3, [pred_buf+r2+0x30] movq [r1+FDEC_STRIDE*-4], m0 movhps [r1+FDEC_STRIDE*-2], m0 movq [r1+FDEC_STRIDE*-3], m1 movhps [r1+FDEC_STRIDE*-1], m1 movq [r1+FDEC_STRIDE* 0], m2 movhps [r1+FDEC_STRIDE* 2], m2 movq [r1+FDEC_STRIDE* 1], m3 movhps [r1+FDEC_STRIDE* 3], m3 ADD rsp, pad RET ALIGN 16 .sa8d: 
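; .sa8d: shared tail for the six directional modes above. It takes the four
; prediction registers prepared by the pshufb sequences (two 8-pixel rows per
; register), stores them to the pred buffer at r5 so the winning mode can be
; written back to fdec at the end, forms the differences against the cached
; fenc rows in fenc_buf, runs the 8x8 Hadamard on them and returns the
; per-lane sums of absolute coefficients in m0 for the caller to reduce.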
%xdefine mret m0 %xdefine fenc_buf fenc_buf+gprsize mova [r5+0x00], m0 mova [r5+0x10], m1 mova [r5+0x20], m2 mova [r5+0x30], m3 movddup m4, m0 movddup m5, m1 movddup m6, m2 movddup m7, m3 punpckhqdq m0, m0 punpckhqdq m1, m1 punpckhqdq m2, m2 punpckhqdq m3, m3 PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3 pmaddubsw m0, m15 pmaddubsw m1, m15 psubw m0, [fenc_buf+0x00] psubw m1, [fenc_buf+0x10] pmaddubsw m2, m15 pmaddubsw m3, m15 psubw m2, [fenc_buf+0x20] psubw m3, [fenc_buf+0x30] pmaddubsw m4, m15 pmaddubsw m5, m15 psubw m4, [fenc_buf+0x40] psubw m5, [fenc_buf+0x50] pmaddubsw m6, m15 pmaddubsw m7, m15 psubw m6, [fenc_buf+0x60] psubw m7, [fenc_buf+0x70] HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14 paddw m0, m1 paddw m0, m2 paddw mret, m0, m3 ret %endif ; ARCH_X86_64 %endmacro ; INTRA8_X9 ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0 ; out: [tmp]=hadamard4, m0=satd INIT_MMX mmx2 cglobal hadamard_ac_4x4 %if HIGH_BIT_DEPTH mova m0, [r0] mova m1, [r0+r1] mova m2, [r0+r1*2] mova m3, [r0+r2] %else ; !HIGH_BIT_DEPTH movh m0, [r0] movh m1, [r0+r1] movh m2, [r0+r1*2] movh m3, [r0+r2] punpcklbw m0, m7 punpcklbw m1, m7 punpcklbw m2, m7 punpcklbw m3, m7 %endif ; HIGH_BIT_DEPTH HADAMARD4_2D 0, 1, 2, 3, 4 mova [r3], m0 mova [r3+8], m1 mova [r3+16], m2 mova [r3+24], m3 ABSW m0, m0, m4 ABSW m1, m1, m4 pand m0, m6 ABSW m2, m2, m4 ABSW m3, m3, m4 paddw m0, m1 paddw m2, m3 paddw m0, m2 SAVE_MM_PERMUTATION ret cglobal hadamard_ac_2x2max mova m0, [r3+0x00] mova m1, [r3+0x20] mova m2, [r3+0x40] mova m3, [r3+0x60] sub r3, 8 SUMSUB_BADC w, 0, 1, 2, 3, 4 ABSW2 m0, m2, m0, m2, m4, m5 ABSW2 m1, m3, m1, m3, m4, m5 HADAMARD 0, max, 0, 2, 4, 5 HADAMARD 0, max, 1, 3, 4, 5 %if HIGH_BIT_DEPTH pmaddwd m0, m7 pmaddwd m1, m7 paddd m6, m0 paddd m6, m1 %else ; !HIGH_BIT_DEPTH paddw m7, m0 paddw m7, m1 %endif ; HIGH_BIT_DEPTH SAVE_MM_PERMUTATION ret %macro AC_PREP 2 %if HIGH_BIT_DEPTH pmaddwd %1, %2 %endif %endmacro %macro AC_PADD 3 %if HIGH_BIT_DEPTH AC_PREP %2, %3 paddd %1, %2 %else paddw %1, %2 %endif ; HIGH_BIT_DEPTH %endmacro cglobal hadamard_ac_8x8 mova m6, [mask_ac4] %if HIGH_BIT_DEPTH mova m7, [pw_1] %else pxor m7, m7 %endif ; HIGH_BIT_DEPTH call hadamard_ac_4x4_mmx2 add r0, 4*SIZEOF_PIXEL add r3, 32 mova m5, m0 AC_PREP m5, m7 call hadamard_ac_4x4_mmx2 lea r0, [r0+4*r1] add r3, 64 AC_PADD m5, m0, m7 call hadamard_ac_4x4_mmx2 sub r0, 4*SIZEOF_PIXEL sub r3, 32 AC_PADD m5, m0, m7 call hadamard_ac_4x4_mmx2 AC_PADD m5, m0, m7 sub r3, 40 mova [rsp+gprsize+8], m5 ; save satd %if HIGH_BIT_DEPTH pxor m6, m6 %endif %rep 3 call hadamard_ac_2x2max_mmx2 %endrep mova m0, [r3+0x00] mova m1, [r3+0x20] mova m2, [r3+0x40] mova m3, [r3+0x60] SUMSUB_BADC w, 0, 1, 2, 3, 4 HADAMARD 0, sumsub, 0, 2, 4, 5 ABSW2 m1, m3, m1, m3, m4, m5 ABSW2 m0, m2, m0, m2, m4, m5 HADAMARD 0, max, 1, 3, 4, 5 %if HIGH_BIT_DEPTH pand m0, [mask_ac4] pmaddwd m1, m7 pmaddwd m0, m7 pmaddwd m2, m7 paddd m6, m1 paddd m0, m2 paddd m6, m6 paddd m0, m6 SWAP 0, 6 %else ; !HIGH_BIT_DEPTH pand m6, m0 paddw m7, m1 paddw m6, m2 paddw m7, m7 paddw m6, m7 %endif ; HIGH_BIT_DEPTH mova [rsp+gprsize], m6 ; save sa8d SWAP 0, 6 SAVE_MM_PERMUTATION ret %macro HADAMARD_AC_WXH_SUM_MMX 2 mova m1, [rsp+1*mmsize] %if HIGH_BIT_DEPTH %if %1*%2 >= 128 paddd m0, [rsp+2*mmsize] paddd m1, [rsp+3*mmsize] %endif %if %1*%2 == 256 mova m2, [rsp+4*mmsize] paddd m1, [rsp+5*mmsize] paddd m2, [rsp+6*mmsize] mova m3, m0 paddd m1, [rsp+7*mmsize] paddd m0, m2 %endif psrld m0, 1 HADDD m0, m2 psrld m1, 1 HADDD m1, m3 %else ; !HIGH_BIT_DEPTH %if %1*%2 >= 128 paddusw m0, [rsp+2*mmsize] 
paddusw m1, [rsp+3*mmsize] %endif %if %1*%2 == 256 mova m2, [rsp+4*mmsize] paddusw m1, [rsp+5*mmsize] paddusw m2, [rsp+6*mmsize] mova m3, m0 paddusw m1, [rsp+7*mmsize] pxor m3, m2 pand m3, [pw_1] pavgw m0, m2 psubusw m0, m3 HADDUW m0, m2 %else psrlw m0, 1 HADDW m0, m2 %endif psrlw m1, 1 HADDW m1, m3 %endif ; HIGH_BIT_DEPTH %endmacro %macro HADAMARD_AC_WXH_MMX 2 cglobal pixel_hadamard_ac_%1x%2, 2,4 %assign pad 16-gprsize-(stack_offset&15) %define ysub r1 FIX_STRIDES r1 sub rsp, 16+128+pad lea r2, [r1*3] lea r3, [rsp+16] call hadamard_ac_8x8_mmx2 %if %2==16 %define ysub r2 lea r0, [r0+r1*4] sub rsp, 16 call hadamard_ac_8x8_mmx2 %endif %if %1==16 neg ysub sub rsp, 16 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8_mmx2 %if %2==16 lea r0, [r0+r1*4] sub rsp, 16 call hadamard_ac_8x8_mmx2 %endif %endif HADAMARD_AC_WXH_SUM_MMX %1, %2 movd edx, m0 movd eax, m1 shr edx, 1 %if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif add rsp, 128+%1*%2/4+pad RET %endmacro ; HADAMARD_AC_WXH_MMX HADAMARD_AC_WXH_MMX 16, 16 HADAMARD_AC_WXH_MMX 8, 16 HADAMARD_AC_WXH_MMX 16, 8 HADAMARD_AC_WXH_MMX 8, 8 %macro LOAD_INC_8x4W_SSE2 5 %if HIGH_BIT_DEPTH movu m%1, [r0] movu m%2, [r0+r1] movu m%3, [r0+r1*2] movu m%4, [r0+r2] %ifidn %1, 0 lea r0, [r0+r1*4] %endif %else ; !HIGH_BIT_DEPTH movh m%1, [r0] movh m%2, [r0+r1] movh m%3, [r0+r1*2] movh m%4, [r0+r2] %ifidn %1, 0 lea r0, [r0+r1*4] %endif punpcklbw m%1, m%5 punpcklbw m%2, m%5 punpcklbw m%3, m%5 punpcklbw m%4, m%5 %endif ; HIGH_BIT_DEPTH %endmacro %macro LOAD_INC_8x4W_SSSE3 5 LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1] %ifidn %1, 0 lea r0, [r0+r1*4] %endif HSUMSUB %1, %2, %3, %4, %5 %endmacro %macro HADAMARD_AC_SSE2 0 ; in: r0=pix, r1=stride, r2=stride*3 ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4 cglobal hadamard_ac_8x8 %if ARCH_X86_64 %define spill0 m8 %define spill1 m9 %define spill2 m10 %else %define spill0 [rsp+gprsize] %define spill1 [rsp+gprsize+mmsize] %define spill2 [rsp+gprsize+mmsize*2] %endif %if HIGH_BIT_DEPTH %define vertical 1 %elif cpuflag(ssse3) && notcpuflag(atom) %define vertical 0 ;LOAD_INC loads sumsubs mova m7, [hmul_8p] %else %define vertical 1 ;LOAD_INC only unpacks to words pxor m7, m7 %endif LOAD_INC_8x4W 0, 1, 2, 3, 7 %if vertical HADAMARD4_2D_SSE 0, 1, 2, 3, 4 %else HADAMARD4_V 0, 1, 2, 3, 4 %endif mova spill0, m1 SWAP 1, 7 LOAD_INC_8x4W 4, 5, 6, 7, 1 %if vertical HADAMARD4_2D_SSE 4, 5, 6, 7, 1 %else HADAMARD4_V 4, 5, 6, 7, 1 ; FIXME SWAP mova m1, spill0 mova spill0, m6 mova spill1, m7 HADAMARD 1, sumsub, 0, 1, 6, 7 HADAMARD 1, sumsub, 2, 3, 6, 7 mova m6, spill0 mova m7, spill1 mova spill0, m1 mova spill1, m0 HADAMARD 1, sumsub, 4, 5, 1, 0 HADAMARD 1, sumsub, 6, 7, 1, 0 mova m0, spill1 %endif mova spill1, m2 mova spill2, m3 ABSW m1, m0, m0 ABSW m2, m4, m4 ABSW m3, m5, m5 paddw m1, m2 SUMSUB_BA w, 0, 4 %if vertical pand m1, [mask_ac4] %else pand m1, [mask_ac4b] %endif AC_PREP m1, [pw_1] ABSW m2, spill0 AC_PADD m1, m3, [pw_1] ABSW m3, spill1 AC_PADD m1, m2, [pw_1] ABSW m2, spill2 AC_PADD m1, m3, [pw_1] ABSW m3, m6, m6 AC_PADD m1, m2, [pw_1] ABSW m2, m7, m7 AC_PADD m1, m3, [pw_1] AC_PADD m1, m2, [pw_1] paddw m3, m7, spill2 psubw m7, spill2 mova [rsp+gprsize+mmsize*2], m1 ; save satd paddw m2, m6, spill1 psubw m6, spill1 paddw m1, m5, spill0 psubw m5, spill0 %assign %%x 2 %if vertical %assign %%x 4 %endif mova spill1, m4 HADAMARD %%x, amax, 3, 7, 4 HADAMARD %%x, amax, 2, 6, 7, 4 mova m4, spill1 HADAMARD %%x, amax, 1, 5, 6, 7 HADAMARD %%x, sumsub, 0, 4, 5, 6 AC_PREP m2, [pw_1] AC_PADD m2, m3, [pw_1] AC_PADD 
m2, m1, [pw_1] %if HIGH_BIT_DEPTH paddd m2, m2 %else paddw m2, m2 %endif ; HIGH_BIT_DEPTH ABSW m4, m4, m7 pand m0, [mask_ac8] ABSW m0, m0, m7 AC_PADD m2, m4, [pw_1] AC_PADD m2, m0, [pw_1] mova [rsp+gprsize+mmsize], m2 ; save sa8d SWAP 0, 2 SAVE_MM_PERMUTATION ret HADAMARD_AC_WXH_SSE2 16, 16 HADAMARD_AC_WXH_SSE2 16, 8 %if mmsize <= 16 HADAMARD_AC_WXH_SSE2 8, 16 HADAMARD_AC_WXH_SSE2 8, 8 %endif %endmacro ; HADAMARD_AC_SSE2 %macro HADAMARD_AC_WXH_SUM_SSE2 2 mova m1, [rsp+2*mmsize] %if HIGH_BIT_DEPTH %if %1*%2 >= 128 paddd m0, [rsp+3*mmsize] paddd m1, [rsp+4*mmsize] %endif %if %1*%2 == 256 paddd m0, [rsp+5*mmsize] paddd m1, [rsp+6*mmsize] paddd m0, [rsp+7*mmsize] paddd m1, [rsp+8*mmsize] psrld m0, 1 %endif HADDD xm0, xm2 HADDD xm1, xm3 %else ; !HIGH_BIT_DEPTH %if %1*%2*16/mmsize >= 128 paddusw m0, [rsp+3*mmsize] paddusw m1, [rsp+4*mmsize] %endif %if %1*%2*16/mmsize == 256 paddusw m0, [rsp+5*mmsize] paddusw m1, [rsp+6*mmsize] paddusw m0, [rsp+7*mmsize] paddusw m1, [rsp+8*mmsize] psrlw m0, 1 %endif %if mmsize==32 vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 paddusw xm0, xm2 paddusw xm1, xm3 %endif HADDUW xm0, xm2 HADDW xm1, xm3 %endif ; HIGH_BIT_DEPTH %endmacro ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride ) %macro HADAMARD_AC_WXH_SSE2 2 cglobal pixel_hadamard_ac_%1x%2, 2,4,11 %define ysub r1 FIX_STRIDES r1 mov r3, rsp and rsp, ~(mmsize-1) sub rsp, mmsize*3 lea r2, [r1*3] call hadamard_ac_8x8 %if %2==16 %define ysub r2 lea r0, [r0+r1*4] sub rsp, mmsize*2 call hadamard_ac_8x8 %endif %if %1==16 && mmsize <= 16 neg ysub sub rsp, mmsize*2 lea r0, [r0+ysub*4+8*SIZEOF_PIXEL] neg ysub call hadamard_ac_8x8 %if %2==16 lea r0, [r0+r1*4] sub rsp, mmsize*2 call hadamard_ac_8x8 %endif %endif HADAMARD_AC_WXH_SUM_SSE2 %1, %2 movd edx, xm0 movd eax, xm1 shr edx, 2 - (%1*%2*16/mmsize >> 8) shr eax, 1 %if ARCH_X86_64 shl rdx, 32 add rax, rdx %endif mov rsp, r3 RET %endmacro ; HADAMARD_AC_WXH_SSE2 ; instantiate satds %if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0 cextern pixel_sa8d_8x8_internal_mmx2 INIT_MMX mmx2 SA8D %endif %define TRANS TRANS_SSE2 %define DIFFOP DIFF_UNPACK_SSE2 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2 %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2 %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size %define movdqu movups %define punpcklqdq movlhps INIT_XMM sse2 SA8D SATDS_SSE2 %if ARCH_X86_64 SA8D_SATD %endif %if HIGH_BIT_DEPTH == 0 INTRA_SA8D_SSE2 %endif INIT_MMX mmx2 INTRA_X3_MMX INIT_XMM sse2 HADAMARD_AC_SSE2 %if HIGH_BIT_DEPTH == 0 INIT_XMM ssse3,atom SATDS_SSE2 SA8D HADAMARD_AC_SSE2 %if ARCH_X86_64 SA8D_SATD %endif %endif %define DIFFOP DIFF_SUMSUB_SSSE3 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE %if HIGH_BIT_DEPTH == 0 %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3 %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3 %endif INIT_XMM ssse3 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 %if ARCH_X86_64 SA8D_SATD %endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif %undef movdqa ; nehalem doesn't like movaps %undef movdqu ; movups %undef punpcklqdq ; or movlhps %if HIGH_BIT_DEPTH == 0 INIT_MMX ssse3 INTRA_X3_MMX %endif %define TRANS TRANS_SSE4 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN INIT_XMM sse4 SATDS_SSE2 SA8D HADAMARD_AC_SSE2 %if ARCH_X86_64 SA8D_SATD %endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so ; it's effectively free. 
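; (As with the blocks above, each INIT_XMM/INIT_YMM instantiation that follows
; re-expands the same macros under a different cpuflag suffix, so one copy of
; the source yields e.g. the _sse2/_ssse3/_sse4/_avx/_xop variants of each
; function, and the best variant supported by the host CPU is picked once at
; init time.)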
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE INIT_XMM avx SATDS_SSE2 SA8D %if ARCH_X86_64 SA8D_SATD %endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 INTRA8_X9 %endif HADAMARD_AC_SSE2 %define TRANS TRANS_XOP INIT_XMM xop SATDS_SSE2 SA8D %if ARCH_X86_64 SA8D_SATD %endif %if HIGH_BIT_DEPTH == 0 INTRA_X9 ; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why. %endif HADAMARD_AC_SSE2 %if HIGH_BIT_DEPTH == 0 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2 %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2 %define TRANS TRANS_SSE4 INIT_YMM avx2 HADAMARD_AC_SSE2 %if ARCH_X86_64 SA8D_SATD %endif %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul] movq xm%1, [r0] movq xm%3, [r2] movq xm%2, [r0+r1] movq xm%4, [r2+r3] vinserti128 m%1, m%1, [r0+4*r1], 1 vinserti128 m%3, m%3, [r2+4*r3], 1 vinserti128 m%2, m%2, [r0+r4], 1 vinserti128 m%4, m%4, [r2+r5], 1 punpcklqdq m%1, m%1 punpcklqdq m%3, m%3 punpcklqdq m%2, m%2 punpcklqdq m%4, m%4 DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7 lea r0, [r0+2*r1] lea r2, [r2+2*r3] movq xm%3, [r0] movq xm%5, [r2] movq xm%4, [r0+r1] movq xm%6, [r2+r3] vinserti128 m%3, m%3, [r0+4*r1], 1 vinserti128 m%5, m%5, [r2+4*r3], 1 vinserti128 m%4, m%4, [r0+r4], 1 vinserti128 m%6, m%6, [r2+r5], 1 punpcklqdq m%3, m%3 punpcklqdq m%5, m%5 punpcklqdq m%4, m%4 punpcklqdq m%6, m%6 DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7 %endmacro %macro SATD_START_AVX2 2-3 0 FIX_STRIDES r1, r3 %if %3 mova %2, [hmul_8p] lea r4, [5*r1] lea r5, [5*r3] %else mova %2, [hmul_16p] lea r4, [3*r1] lea r5, [3*r3] %endif pxor %1, %1 %endmacro %define TRANS TRANS_SSE4 INIT_YMM avx2 cglobal pixel_satd_16x8_internal LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 ret cglobal pixel_satd_16x16, 4,6,8 SATD_START_AVX2 m6, m7 call pixel_satd_16x8_internal lea r0, [r0+4*r1] lea r2, [r2+4*r3] pixel_satd_16x8_internal: call pixel_satd_16x8_internal vextracti128 xm0, m6, 1 paddw xm0, xm6 SATD_END_SSE2 xm0 RET cglobal pixel_satd_16x8, 4,6,8 SATD_START_AVX2 m6, m7 jmp pixel_satd_16x8_internal cglobal pixel_satd_8x8_internal LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6 ret cglobal pixel_satd_8x16, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_satd_8x8_internal lea r0, [r0+2*r1] lea r2, [r2+2*r3] lea r0, [r0+4*r1] lea r2, [r2+4*r3] call pixel_satd_8x8_internal vextracti128 xm0, m6, 1 paddw xm0, xm6 SATD_END_SSE2 xm0 RET cglobal pixel_satd_8x8, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_satd_8x8_internal vextracti128 xm0, m6, 1 paddw xm0, xm6 SATD_END_SSE2 xm0 RET cglobal pixel_sa8d_8x8_internal LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7 HADAMARD4_V 0, 1, 2, 3, 4 HADAMARD 8, sumsub, 0, 1, 4, 5 HADAMARD 8, sumsub, 2, 3, 4, 5 HADAMARD 2, sumsub, 0, 1, 4, 5 HADAMARD 2, sumsub, 2, 3, 4, 5 HADAMARD 1, amax, 0, 1, 4, 5 HADAMARD 1, amax, 2, 3, 4, 5 paddw m6, m0 paddw m6, m2 ret cglobal pixel_sa8d_8x8, 4,6,8 SATD_START_AVX2 m6, m7, 1 call pixel_sa8d_8x8_internal vextracti128 xm1, m6, 1 paddw xm6, xm1 HADDW xm6, xm1 movd eax, xm6 add eax, 1 shr eax, 1 RET cglobal intra_sad_x9_8x8, 5,7,8 %define pred(i,j) [rsp+i*0x40+j*0x20] mov r6, rsp and rsp, ~31 sub rsp, 0x240 movu m5, [r0+0*FENC_STRIDE] movu m6, [r0+4*FENC_STRIDE] punpcklqdq m5, [r0+2*FENC_STRIDE] punpcklqdq m6, [r0+6*FENC_STRIDE] ; save instruction size: avoid 4-byte memory offsets lea r0, [intra8x9_h1+128] %define off(m) (r0+m-(intra8x9_h1+128)) vpbroadcastq m0, [r2+16] psadbw m4, m0, m5 psadbw m2, m0, m6 mova pred(0,0), m0 mova pred(0,1), m0 paddw 
m4, m2 vpbroadcastq m1, [r2+7] pshufb m3, m1, [off(intra8x9_h1)] pshufb m2, m1, [off(intra8x9_h3)] mova pred(1,0), m3 mova pred(1,1), m2 psadbw m3, m5 psadbw m2, m6 paddw m3, m2 lea r5, [rsp+0x100] %define pred(i,j) [r5+i*0x40+j*0x20-0x100] ; combine the first two pslldq m3, 2 por m4, m3 pxor m2, m2 psadbw m0, m2 psadbw m1, m2 paddw m0, m1 psrlw m0, 3 pavgw m0, m2 pshufb m0, m2 mova pred(2,0), m0 mova pred(2,1), m0 psadbw m3, m0, m5 psadbw m2, m0, m6 paddw m3, m2 pslldq m3, 4 por m4, m3 vbroadcasti128 m0, [r2+16] vbroadcasti128 m2, [r2+17] pslldq m1, m0, 1 pavgb m3, m0, m2 PRED4x4_LOWPASS m0, m1, m2, m0, m7 pshufb m1, m0, [off(intra8x9_ddl1)] pshufb m2, m0, [off(intra8x9_ddl3)] mova pred(3,0), m1 mova pred(3,1), m2 psadbw m1, m5 psadbw m2, m6 paddw m1, m2 pslldq m1, 6 por m4, m1 vextracti128 xm1, m4, 1 paddw xm4, xm1 mova [r4], xm4 ; for later vinserti128 m7, m3, xm0, 1 vbroadcasti128 m2, [r2+8] vbroadcasti128 m0, [r2+7] vbroadcasti128 m1, [r2+6] pavgb m3, m2, m0 PRED4x4_LOWPASS m0, m1, m2, m0, m4 pshufb m1, m0, [off(intra8x9_ddr1)] pshufb m2, m0, [off(intra8x9_ddr3)] mova pred(4,0), m1 mova pred(4,1), m2 psadbw m4, m1, m5 psadbw m2, m6 paddw m4, m2 add r0, 256 add r5, 0xC0 %define off(m) (r0+m-(intra8x9_h1+256+128)) %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0] vpblendd m2, m3, m0, 11110011b pshufb m1, m2, [off(intra8x9_vr1)] pshufb m2, m2, [off(intra8x9_vr3)] mova pred(5,0), m1 mova pred(5,1), m2 psadbw m1, m5 psadbw m2, m6 paddw m1, m2 pslldq m1, 2 por m4, m1 psrldq m2, m3, 4 pblendw m2, m0, q3330 punpcklbw m0, m3 pshufb m1, m2, [off(intra8x9_hd1)] pshufb m2, m0, [off(intra8x9_hd3)] mova pred(6,0), m1 mova pred(6,1), m2 psadbw m1, m5 psadbw m2, m6 paddw m1, m2 pslldq m1, 4 por m4, m1 pshufb m1, m7, [off(intra8x9_vl1)] pshufb m2, m7, [off(intra8x9_vl3)] mova pred(7,0), m1 mova pred(7,1), m2 psadbw m1, m5 psadbw m2, m6 paddw m1, m2 pslldq m1, 6 por m4, m1 vextracti128 xm1, m4, 1 paddw xm4, xm1 mova xm3, [r4] SBUTTERFLY qdq, 3, 4, 7 paddw xm3, xm4 pslldq m1, m0, 1 vpbroadcastd m0, [r2+7] palignr m0, m1, 1 pshufb m1, m0, [off(intra8x9_hu1)] pshufb m2, m0, [off(intra8x9_hu3)] mova pred(8,0), m1 mova pred(8,1), m2 psadbw m1, m5 psadbw m2, m6 paddw m1, m2 vextracti128 xm2, m1, 1 paddw xm1, xm2 MOVHL xm2, xm1 paddw xm1, xm2 movd r2d, xm1 paddw xm3, [r3] mova [r4], xm3 add r2w, word [r3+16] mov [r4+16], r2w phminposuw xm3, xm3 movd r3d, xm3 add r2d, 8<<16 cmp r3w, r2w cmovg r3d, r2d mov r2d, r3d shr r3, 16 shl r3, 6 add r1, 4*FDEC_STRIDE mova xm0, [rsp+r3+0x00] mova xm1, [rsp+r3+0x10] mova xm2, [rsp+r3+0x20] mova xm3, [rsp+r3+0x30] movq [r1+FDEC_STRIDE*-4], xm0 movhps [r1+FDEC_STRIDE*-2], xm0 movq [r1+FDEC_STRIDE*-3], xm1 movhps [r1+FDEC_STRIDE*-1], xm1 movq [r1+FDEC_STRIDE* 0], xm2 movhps [r1+FDEC_STRIDE* 2], xm2 movq [r1+FDEC_STRIDE* 1], xm3 movhps [r1+FDEC_STRIDE* 3], xm3 mov rsp, r6 mov eax, r2d RET %macro SATD_AVX512_LOAD4 2 ; size, opmask vpbroadcast%1 m0, [r0] vpbroadcast%1 m0 {%2}, [r0+2*r1] vpbroadcast%1 m2, [r2] vpbroadcast%1 m2 {%2}, [r2+2*r3] add r0, r1 add r2, r3 vpbroadcast%1 m1, [r0] vpbroadcast%1 m1 {%2}, [r0+2*r1] vpbroadcast%1 m3, [r2] vpbroadcast%1 m3 {%2}, [r2+2*r3] %endmacro %macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3 vpbroadcast%1 %{2}0, [r0] vpbroadcast%1 %{2}0 {%3}, [r0+2*r1] vpbroadcast%1 %{2}2, [r2] vpbroadcast%1 %{2}2 {%3}, [r2+2*r3] vpbroadcast%1 m0 {%4}, [r0+4*r1] vpbroadcast%1 m2 {%4}, [r2+4*r3] vpbroadcast%1 m0 {%5}, [r0+2*r4] vpbroadcast%1 m2 {%5}, [r2+2*r5] vpbroadcast%1 %{2}1, [r0+r1] vpbroadcast%1 %{2}1 {%3}, [r0+r4] vpbroadcast%1 %{2}3, 
[r2+r3] vpbroadcast%1 %{2}3 {%3}, [r2+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] vpbroadcast%1 m1 {%4}, [r0+r1] vpbroadcast%1 m3 {%4}, [r2+r3] vpbroadcast%1 m1 {%5}, [r0+r4] vpbroadcast%1 m3 {%5}, [r2+r5] %endmacro %macro SATD_AVX512_PACKED 0 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 SUMSUB_BA w, 0, 1, 2 SBUTTERFLY qdq, 0, 1, 2 SUMSUB_BA w, 0, 1, 2 HMAXABSW2 0, 1, 2, 3 %endmacro %macro SATD_AVX512_END 0-1 0 ; sa8d vpaddw m0 {k1}{z}, m1 ; zero-extend to dwords %if ARCH_X86_64 %if mmsize == 64 vextracti32x8 ym1, m0, 1 paddd ym0, ym1 %endif %if mmsize >= 32 vextracti128 xm1, ym0, 1 paddd xmm0, xm0, xm1 %endif punpckhqdq xmm1, xmm0, xmm0 paddd xmm0, xmm1 movq rax, xmm0 rorx rdx, rax, 32 %if %1 lea eax, [rax+rdx+1] shr eax, 1 %else add eax, edx %endif %else HADDD m0, m1 movd eax, xm0 %if %1 inc eax shr eax, 1 %endif %endif RET %endmacro %macro HMAXABSW2 4 ; a, b, tmp1, tmp2 pabsw m%1, m%1 pabsw m%2, m%2 psrldq m%3, m%1, 2 psrld m%4, m%2, 16 pmaxsw m%1, m%3 pmaxsw m%2, m%4 %endmacro INIT_ZMM avx512 cglobal pixel_satd_16x8_internal vbroadcasti64x4 m6, [hmul_16p] kxnorb k2, k2, k2 mov r4d, 0x55555555 knotw k2, k2 kmovd k1, r4d lea r4, [3*r1] lea r5, [3*r3] satd_16x8_avx512: vbroadcasti128 ym0, [r0] vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4 vbroadcasti128 ym4, [r2] vbroadcasti32x4 m4 {k2}, [r2+4*r3] vbroadcasti128 ym2, [r0+2*r1] vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6 vbroadcasti128 ym5, [r2+2*r3] vbroadcasti32x4 m5 {k2}, [r2+2*r5] DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6 vbroadcasti128 ym1, [r0+r1] vbroadcasti128 ym4, [r2+r3] vbroadcasti128 ym3, [r0+r4] vbroadcasti128 ym5, [r2+r5] lea r0, [r0+4*r1] lea r2, [r2+4*r3] vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5 vbroadcasti32x4 m4 {k2}, [r2+r3] vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7 vbroadcasti32x4 m5 {k2}, [r2+r5] DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6 HADAMARD4_V 0, 1, 2, 3, 4 HMAXABSW2 0, 2, 4, 5 HMAXABSW2 1, 3, 4, 5 paddw m4, m0, m2 ; m1 paddw m2, m1, m3 ; m0 ret cglobal pixel_satd_8x8_internal vbroadcasti64x4 m4, [hmul_16p] mov r4d, 0x55555555 kmovd k1, r4d ; 01010101 kshiftlb k2, k1, 5 ; 10100000 kshiftlb k3, k1, 4 ; 01010000 lea r4, [3*r1] lea r5, [3*r3] satd_8x8_avx512: SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5 ret cglobal pixel_satd_16x8, 4,6 call pixel_satd_16x8_internal_avx512 jmp satd_zmm_avx512_end cglobal pixel_satd_16x16, 4,6 call pixel_satd_16x8_internal_avx512 lea r0, [r0+4*r1] lea r2, [r2+4*r3] paddw m7, m0, m1 call satd_16x8_avx512 paddw m1, m7 jmp satd_zmm_avx512_end cglobal pixel_satd_8x8, 4,6 call pixel_satd_8x8_internal_avx512 satd_zmm_avx512_end: SATD_AVX512_END cglobal pixel_satd_8x16, 4,6 call pixel_satd_8x8_internal_avx512 lea r0, [r0+4*r1] lea r2, [r2+4*r3] paddw m5, m0, m1 call satd_8x8_avx512 paddw m1, m5 jmp satd_zmm_avx512_end INIT_YMM avx512 cglobal pixel_satd_4x8_internal vbroadcasti128 m4, [hmul_4p] mov r4d, 0x55550c kmovd k2, r4d ; 00001100 kshiftlb k3, k2, 2 ; 00110000 kshiftlb k4, k2, 4 ; 11000000 kshiftrd k1, k2, 8 ; 01010101 lea r4, [3*r1] lea r5, [3*r3] satd_4x8_avx512: SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6 satd_ymm_avx512: ; 1 1 3 3 5 5 7 7 SATD_AVX512_PACKED ret cglobal pixel_satd_8x4, 4,5 mova m4, [hmul_16p] mov r4d, 0x5555 kmovw k1, r4d SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0 call satd_ymm_avx512 ; 3 1 3 1 jmp satd_ymm_avx512_end2 cglobal pixel_satd_4x8, 4,6 call pixel_satd_4x8_internal_avx512 satd_ymm_avx512_end: %if ARCH_X86_64 == 0 pop r5d %assign regs_used 5 %endif satd_ymm_avx512_end2: SATD_AVX512_END cglobal pixel_satd_4x16, 4,6 call 
pixel_satd_4x8_internal_avx512 lea r0, [r0+4*r1] lea r2, [r2+4*r3] paddw m5, m0, m1 call satd_4x8_avx512 paddw m1, m5 jmp satd_ymm_avx512_end INIT_XMM avx512 cglobal pixel_satd_4x4, 4,5 mova m4, [hmul_4p] mov r4d, 0x550c kmovw k2, r4d kshiftrw k1, k2, 8 SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2 SATD_AVX512_PACKED ; 1 1 3 3 SWAP 0, 1 SATD_AVX512_END INIT_ZMM avx512 cglobal pixel_sa8d_8x8, 4,6 vbroadcasti64x4 m4, [hmul_16p] mov r4d, 0x55555555 kmovd k1, r4d ; 01010101 kshiftlb k2, k1, 5 ; 10100000 kshiftlb k3, k1, 4 ; 01010000 lea r4, [3*r1] lea r5, [3*r3] SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5 SUMSUB_BA w, 0, 1, 2 SBUTTERFLY qdq, 0, 1, 2 SUMSUB_BA w, 0, 1, 2 shufps m2, m0, m1, q2020 shufps m1, m0, m1, q3131 SUMSUB_BA w, 2, 1, 0 vshufi32x4 m0, m2, m1, q1010 vshufi32x4 m1, m2, m1, q3232 SUMSUB_BA w, 0, 1, 2 HMAXABSW2 0, 1, 2, 3 SATD_AVX512_END 1 %endif ; HIGH_BIT_DEPTH ;============================================================================= ; SSIM ;============================================================================= ;----------------------------------------------------------------------------- ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) ;----------------------------------------------------------------------------- %macro SSIM_ITER 1 %if HIGH_BIT_DEPTH movu m4, [r0+(%1&1)*r1] movu m5, [r2+(%1&1)*r3] %elif cpuflag(avx) pmovzxbw m4, [r0+(%1&1)*r1] pmovzxbw m5, [r2+(%1&1)*r3] %else movq m4, [r0+(%1&1)*r1] movq m5, [r2+(%1&1)*r3] punpcklbw m4, m7 punpcklbw m5, m7 %endif %if %1==1 lea r0, [r0+r1*2] lea r2, [r2+r3*2] %endif %if %1 == 0 && cpuflag(avx) SWAP 0, 4 SWAP 1, 5 pmaddwd m4, m0, m0 pmaddwd m5, m1, m1 pmaddwd m6, m0, m1 %else %if %1 == 0 mova m0, m4 mova m1, m5 %else paddw m0, m4 paddw m1, m5 %endif pmaddwd m6, m4, m5 pmaddwd m4, m4 pmaddwd m5, m5 %endif ACCUM paddd, 2, 4, %1 ACCUM paddd, 3, 6, %1 paddd m2, m5 %endmacro %macro SSIM 0 %if HIGH_BIT_DEPTH cglobal pixel_ssim_4x4x2_core, 4,4,7 FIX_STRIDES r1, r3 %else cglobal pixel_ssim_4x4x2_core, 4,4,7+notcpuflag(avx) %if notcpuflag(avx) pxor m7, m7 %endif %endif SSIM_ITER 0 SSIM_ITER 1 SSIM_ITER 2 SSIM_ITER 3 %if UNIX64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 0 mov t0, r4mp %endif %if cpuflag(ssse3) phaddw m0, m1 pmaddwd m0, [pw_1] phaddd m2, m3 %else mova m4, [pw_1] pmaddwd m0, m4 pmaddwd m1, m4 packssdw m0, m1 shufps m1, m2, m3, q2020 shufps m2, m3, q3131 pmaddwd m0, m4 paddd m2, m1 %endif shufps m1, m0, m2, q2020 shufps m0, m2, q3131 mova [t0], m1 mova [t0+16], m0 RET ;----------------------------------------------------------------------------- ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width ) ;----------------------------------------------------------------------------- cglobal pixel_ssim_end4, 2,3 mov r2d, r2m mova m0, [r0+ 0] mova m1, [r0+16] mova m2, [r0+32] mova m3, [r0+48] mova m4, [r0+64] paddd m0, [r1+ 0] paddd m1, [r1+16] paddd m2, [r1+32] paddd m3, [r1+48] paddd m4, [r1+64] paddd m0, m1 paddd m1, m2 paddd m2, m3 paddd m3, m4 TRANSPOSE4x4D 0, 1, 2, 3, 4 ; s1=m0, s2=m1, ss=m2, s12=m3 %if BIT_DEPTH == 10 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 cvtdq2ps m3, m3 mulps m4, m0, m1 ; s1*s2 mulps m0, m0 ; s1*s1 mulps m1, m1 ; s2*s2 mulps m2, [pf_64] ; ss*64 mulps m3, [pf_128] ; s12*128 addps m4, m4 ; s1*s2*2 addps m0, m1 ; s1*s1 + s2*s2 subps m2, m0 ; vars subps m3, m4 ; covar*2 movaps m1, [ssim_c1] addps m4, m1 ; s1*s2*2 + ssim_c1 addps m0, m1 ; s1*s1 + s2*s2 + 
ssim_c1 movaps m1, [ssim_c2] addps m2, m1 ; vars + ssim_c2 addps m3, m1 ; covar*2 + ssim_c2 %else pmaddwd m4, m1, m0 ; s1*s2 pslld m1, 16 por m0, m1 pmaddwd m0, m0 ; s1*s1 + s2*s2 pslld m4, 1 pslld m3, 7 pslld m2, 6 psubd m3, m4 ; covar*2 psubd m2, m0 ; vars mova m1, [ssim_c1] paddd m0, m1 paddd m4, m1 mova m1, [ssim_c2] paddd m3, m1 paddd m2, m1 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) %endif mulps m4, m3 mulps m0, m2 divps m4, m0 ; ssim cmp r2d, 4 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level neg r2 %if ARCH_X86_64 lea r3, [mask_ff + 16] %xdefine %%mask r3 %else %xdefine %%mask mask_ff + 16 %endif %if cpuflag(avx) andps m4, [%%mask + r2*4] %else movups m0, [%%mask + r2*4] andps m4, m0 %endif .skip: movhlps m0, m4 addps m0, m4 %if cpuflag(ssse3) movshdup m4, m0 %else pshuflw m4, m0, q0032 %endif addss m0, m4 %if ARCH_X86_64 == 0 movss r0m, m0 fld dword r0m %endif RET %endmacro ; SSIM INIT_XMM sse2 SSIM INIT_XMM avx SSIM ;----------------------------------------------------------------------------- ; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); ;----------------------------------------------------------------------------- %macro ASD8 0 cglobal pixel_asd8, 5,5 pxor m0, m0 pxor m1, m1 .loop: %if HIGH_BIT_DEPTH paddw m0, [r0] paddw m1, [r2] paddw m0, [r0+2*r1] paddw m1, [r2+2*r3] lea r0, [r0+4*r1] paddw m0, [r0] paddw m1, [r2+4*r3] lea r2, [r2+4*r3] paddw m0, [r0+2*r1] paddw m1, [r2+2*r3] lea r0, [r0+4*r1] lea r2, [r2+4*r3] %else movq m2, [r0] movq m3, [r2] movhps m2, [r0+r1] movhps m3, [r2+r3] lea r0, [r0+2*r1] psadbw m2, m1 psadbw m3, m1 movq m4, [r0] movq m5, [r2+2*r3] lea r2, [r2+2*r3] movhps m4, [r0+r1] movhps m5, [r2+r3] lea r0, [r0+2*r1] paddw m0, m2 psubw m0, m3 psadbw m4, m1 psadbw m5, m1 lea r2, [r2+2*r3] paddw m0, m4 psubw m0, m5 %endif sub r4d, 4 jg .loop %if HIGH_BIT_DEPTH psubw m0, m1 HADDW m0, m1 ABSD m1, m0 %else MOVHL m1, m0 paddw m0, m1 ABSW m1, m0 %endif movd eax, m1 RET %endmacro INIT_XMM sse2 ASD8 INIT_XMM ssse3 ASD8 %if HIGH_BIT_DEPTH INIT_XMM xop ASD8 %endif ;============================================================================= ; Successive Elimination ADS ;============================================================================= %macro ADS_START 0 %if UNIX64 movsxd r5, r5d %else mov r5d, r5m %endif mov r0d, r5d lea r6, [r4+r5+(mmsize-1)] and r6, ~(mmsize-1) shl r2d, 1 %endmacro %macro ADS_END 1-2 .loop ; unroll_size, loop_label add r1, 2*%1 add r3, 2*%1 add r6, %1 sub r0d, %1 jg %2 WIN64_RESTORE_XMM_INTERNAL %if mmsize==32 vzeroupper %endif lea r6, [r4+r5+(mmsize-1)] and r6, ~(mmsize-1) %if cpuflag(ssse3) jmp ads_mvs_ssse3 %else jmp ads_mvs_mmx %endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta, ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %macro ADS_XMM 0 %if ARCH_X86_64 cglobal pixel_ads4, 5,7,9 %else cglobal pixel_ads4, 5,7,8 %endif %if mmsize >= 32 vpbroadcastd m7, [r0+ 0] vpbroadcastd m6, [r0+ 4] vpbroadcastd m5, [r0+ 8] vpbroadcastd m4, [r0+12] %else mova m4, [r0] pshufd m7, m4, 0 pshufd m6, m4, q1111 pshufd m5, m4, q2222 pshufd m4, m4, q3333 %endif %if ARCH_X86_64 SPLATD m8, r6m %endif ADS_START 
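; Roughly the C equivalent of the loop below (one byte per candidate is
; written to the buffer at r6, here called mask[]; ads_mvs later compacts the
; nonzero bytes into the mvs[] index list):
;   for( int i = 0; i < width; i++ )
;   {
;       int ads = abs( enc_dc[0] - sums[i] )
;               + abs( enc_dc[1] - sums[i+8] )
;               + abs( enc_dc[2] - sums[i+delta] )
;               + abs( enc_dc[3] - sums[i+delta+8] )
;               + cost_mvx[i];
;       mask[i] = ads < thresh;
;   }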
.loop: %if cpuflag(avx) pmovzxwd m0, [r1] pmovzxwd m1, [r1+16] %else movh m0, [r1] movh m1, [r1+16] pxor m3, m3 punpcklwd m0, m3 punpcklwd m1, m3 %endif psubd m0, m7 psubd m1, m6 ABSD m0, m0, m2 ABSD m1, m1, m3 %if cpuflag(avx) pmovzxwd m2, [r1+r2] pmovzxwd m3, [r1+r2+16] paddd m0, m1 %else movh m2, [r1+r2] movh m3, [r1+r2+16] paddd m0, m1 pxor m1, m1 punpcklwd m2, m1 punpcklwd m3, m1 %endif psubd m2, m5 psubd m3, m4 ABSD m2, m2, m1 ABSD m3, m3, m1 paddd m0, m2 paddd m0, m3 %if cpuflag(avx) pmovzxwd m1, [r3] %else movh m1, [r3] pxor m3, m3 punpcklwd m1, m3 %endif paddd m0, m1 %if ARCH_X86_64 psubd m1, m8, m0 %else SPLATD m1, r6m psubd m1, m0 %endif packssdw m1, m1 %if mmsize == 32 vpermq m1, m1, q3120 packuswb m1, m1 movq [r6], xm1 %else packuswb m1, m1 movd [r6], m1 %endif ADS_END mmsize/4 cglobal pixel_ads2, 5,7,8 %if mmsize >= 32 vpbroadcastd m7, [r0+0] vpbroadcastd m6, [r0+4] vpbroadcastd m5, r6m %else movq m6, [r0] movd m5, r6m pshufd m7, m6, 0 pshufd m6, m6, q1111 pshufd m5, m5, 0 %endif pxor m4, m4 ADS_START .loop: %if cpuflag(avx) pmovzxwd m0, [r1] pmovzxwd m1, [r1+r2] pmovzxwd m2, [r3] %else movh m0, [r1] movh m1, [r1+r2] movh m2, [r3] punpcklwd m0, m4 punpcklwd m1, m4 punpcklwd m2, m4 %endif psubd m0, m7 psubd m1, m6 ABSD m0, m0, m3 ABSD m1, m1, m3 paddd m0, m1 paddd m0, m2 psubd m1, m5, m0 packssdw m1, m1 %if mmsize == 32 vpermq m1, m1, q3120 packuswb m1, m1 movq [r6], xm1 %else packuswb m1, m1 movd [r6], m1 %endif ADS_END mmsize/4 cglobal pixel_ads1, 5,7,8 %if mmsize >= 32 vpbroadcastd m7, [r0] vpbroadcastd m6, r6m %else movd m7, [r0] movd m6, r6m pshufd m7, m7, 0 pshufd m6, m6, 0 %endif pxor m5, m5 ADS_START .loop: movu m1, [r1] movu m3, [r3] punpcklwd m0, m1, m5 punpckhwd m1, m5 punpcklwd m2, m3, m5 punpckhwd m3, m5 psubd m0, m7 psubd m1, m7 ABSD m0, m0, m4 ABSD m1, m1, m4 paddd m0, m2 paddd m1, m3 psubd m2, m6, m0 psubd m3, m6, m1 packssdw m2, m3 packuswb m2, m2 %if mmsize == 32 vpermq m2, m2, q3120 mova [r6], xm2 %else movq [r6], m2 %endif ADS_END mmsize/2 %endmacro INIT_XMM sse2 ADS_XMM INIT_XMM ssse3 ADS_XMM INIT_XMM avx ADS_XMM INIT_YMM avx2 ADS_XMM %else ; !HIGH_BIT_DEPTH %macro ADS_XMM 0 %if ARCH_X86_64 && mmsize == 16 cglobal pixel_ads4, 5,7,12 %elif ARCH_X86_64 && mmsize != 8 cglobal pixel_ads4, 5,7,9 %else cglobal pixel_ads4, 5,7,8 %endif test dword r6m, 0xffff0000 %if mmsize >= 32 vpbroadcastw m7, [r0+ 0] vpbroadcastw m6, [r0+ 4] vpbroadcastw m5, [r0+ 8] vpbroadcastw m4, [r0+12] %elif mmsize == 16 mova m4, [r0] pshuflw m7, m4, 0 pshuflw m6, m4, q2222 pshufhw m5, m4, 0 pshufhw m4, m4, q2222 punpcklqdq m7, m7 punpcklqdq m6, m6 punpckhqdq m5, m5 punpckhqdq m4, m4 %else mova m6, [r0] mova m4, [r0+8] pshufw m7, m6, 0 pshufw m6, m6, q2222 pshufw m5, m4, 0 pshufw m4, m4, q2222 %endif jnz .nz ADS_START %if ARCH_X86_64 && mmsize == 16 movu m10, [r1] movu m11, [r1+r2] SPLATW m8, r6m .loop: psubw m0, m10, m7 movu m10, [r1+16] psubw m1, m10, m6 ABSW m0, m0, m2 ABSW m1, m1, m3 psubw m2, m11, m5 movu m11, [r1+r2+16] paddw m0, m1 psubw m3, m11, m4 movu m9, [r3] ABSW m2, m2, m1 ABSW m3, m3, m1 paddw m0, m2 paddw m0, m3 paddusw m0, m9 psubusw m1, m8, m0 %else %if ARCH_X86_64 && mmsize != 8 SPLATW m8, r6m %endif .loop: movu m0, [r1] movu m1, [r1+16] psubw m0, m7 psubw m1, m6 ABSW m0, m0, m2 ABSW m1, m1, m3 movu m2, [r1+r2] movu m3, [r1+r2+16] psubw m2, m5 psubw m3, m4 paddw m0, m1 ABSW m2, m2, m1 ABSW m3, m3, m1 paddw m0, m2 paddw m0, m3 movu m2, [r3] %if ARCH_X86_64 && mmsize != 8 mova m1, m8 %else SPLATW m1, r6m %endif paddusw m0, m2 psubusw m1, m0 %endif ; ARCH packsswb m1, m1 
%if mmsize == 32 vpermq m1, m1, q3120 mova [r6], xm1 %else movh [r6], m1 %endif ADS_END mmsize/2 .nz: ADS_START %if ARCH_X86_64 && mmsize == 16 movu m10, [r1] movu m11, [r1+r2] SPLATD m8, r6m .loop_nz: psubw m0, m10, m7 movu m10, [r1+16] psubw m1, m10, m6 ABSW m0, m0, m2 ABSW m1, m1, m3 psubw m2, m11, m5 movu m11, [r1+r2+16] paddw m0, m1 psubw m3, m11, m4 movu m9, [r3] ABSW m2, m2, m1 ABSW m3, m3, m1 paddw m0, m2 paddw m0, m3 pxor m3, m3 mova m2, m0 mova m1, m9 punpcklwd m0, m3 punpcklwd m9, m3 punpckhwd m2, m3 punpckhwd m1, m3 paddd m0, m9 paddd m2, m1 psubd m1, m8, m0 psubd m3, m8, m2 packssdw m1, m3 packuswb m1, m1 %else %if ARCH_X86_64 && mmsize != 8 SPLATD m8, r6m %endif .loop_nz: movu m0, [r1] movu m1, [r1+16] psubw m0, m7 psubw m1, m6 ABSW m0, m0, m2 ABSW m1, m1, m3 movu m2, [r1+r2] movu m3, [r1+r2+16] psubw m2, m5 psubw m3, m4 paddw m0, m1 ABSW m2, m2, m1 ABSW m3, m3, m1 paddw m0, m2 paddw m0, m3 %if mmsize == 32 movu m1, [r3] %else movh m1, [r3] %endif pxor m3, m3 mova m2, m0 punpcklwd m0, m3 punpcklwd m1, m3 punpckhwd m2, m3 paddd m0, m1 %if mmsize == 32 movu m1, [r3] punpckhwd m1, m3 %else movh m1, [r3+mmsize/2] punpcklwd m1, m3 %endif paddd m2, m1 %if ARCH_X86_64 && mmsize != 8 mova m1, m8 %else SPLATD m1, r6m %endif mova m3, m1 psubd m1, m0 psubd m3, m2 packssdw m1, m3 packuswb m1, m1 %endif ; ARCH %if mmsize == 32 vpermq m1, m1, q3120 mova [r6], xm1 %else movh [r6], m1 %endif ADS_END mmsize/2, .loop_nz cglobal pixel_ads2, 5,7,8 test dword r6m, 0xffff0000 %if mmsize >= 32 vpbroadcastw m7, [r0+0] vpbroadcastw m6, [r0+4] %elif mmsize == 16 movq m6, [r0] pshuflw m7, m6, 0 pshuflw m6, m6, q2222 punpcklqdq m7, m7 punpcklqdq m6, m6 %else mova m6, [r0] pshufw m7, m6, 0 pshufw m6, m6, q2222 %endif jnz .nz ADS_START SPLATW m5, r6m .loop: movu m0, [r1] movu m1, [r1+r2] movu m2, [r3] psubw m0, m7 psubw m1, m6 ABSW m0, m0, m3 ABSW m1, m1, m4 paddw m0, m1 paddusw m0, m2 psubusw m1, m5, m0 packsswb m1, m1 %if mmsize == 32 vpermq m1, m1, q3120 mova [r6], xm1 %else movh [r6], m1 %endif ADS_END mmsize/2 .nz: ADS_START SPLATD m5, r6m pxor m4, m4 .loop_nz: movu m0, [r1] movu m1, [r1+r2] movu m2, [r3] psubw m0, m7 psubw m1, m6 ABSW m0, m0, m3 ABSW m1, m1, m3 paddw m0, m1 punpckhwd m3, m2, m4 punpckhwd m1, m0, m4 punpcklwd m2, m4 punpcklwd m0, m4 paddd m1, m3 paddd m0, m2 psubd m3, m5, m1 psubd m2, m5, m0 packssdw m2, m3 packuswb m2, m2 %if mmsize == 32 vpermq m2, m2, q3120 mova [r6], xm2 %else movh [r6], m2 %endif ADS_END mmsize/2, .loop_nz cglobal pixel_ads1, 5,7,8 test dword r6m, 0xffff0000 SPLATW m7, [r0] jnz .nz ADS_START SPLATW m6, r6m .loop: movu m0, [r1] movu m1, [r1+mmsize] movu m2, [r3] movu m3, [r3+mmsize] psubw m0, m7 psubw m1, m7 ABSW m0, m0, m4 ABSW m1, m1, m5 paddusw m0, m2 paddusw m1, m3 psubusw m4, m6, m0 psubusw m5, m6, m1 packsswb m4, m5 %if mmsize == 32 vpermq m4, m4, q3120 %endif mova [r6], m4 ADS_END mmsize .nz: ADS_START SPLATD m6, r6m pxor m5, m5 .loop_nz: movu m0, [r1] movu m1, [r1+mmsize] movu m2, [r3] psubw m0, m7 psubw m1, m7 ABSW m0, m0, m3 ABSW m1, m1, m4 punpckhwd m3, m2, m5 punpckhwd m4, m0, m5 punpcklwd m2, m5 punpcklwd m0, m5 paddd m4, m3 paddd m0, m2 psubd m3, m6, m4 movu m4, [r3+mmsize] psubd m2, m6, m0 packssdw m2, m3 punpckhwd m0, m1, m5 punpckhwd m3, m4, m5 punpcklwd m1, m5 punpcklwd m4, m5 paddd m0, m3 paddd m1, m4 psubd m3, m6, m0 psubd m4, m6, m1 packssdw m4, m3 packuswb m2, m4 %if mmsize == 32 vpermq m2, m2, q3120 %endif mova [r6], m2 ADS_END mmsize, .loop_nz %endmacro INIT_MMX mmx2 ADS_XMM INIT_XMM sse2 ADS_XMM INIT_XMM ssse3 ADS_XMM INIT_XMM avx ADS_XMM 
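; Note on the low-bit-depth template above: the .loop paths accumulate in
; 16-bit lanes with unsigned saturation, which is fine while thresh fits in
; 16 bits; when it does not (the test of r6m against 0xffff0000), control
; diverts to the .nz variants, which widen to 32-bit lanes before the final
; saturating pack so large thresholds are handled correctly.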
INIT_YMM avx2 ADS_XMM %endif ; HIGH_BIT_DEPTH ; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) ; { ; int nmv=0, i, j; ; *(uint32_t*)(masks+width) = 0; ; for( i=0; i * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_X86_PIXEL_H #define X264_X86_PIXEL_H #define x264_pixel_ads1_avx x264_template(pixel_ads1_avx) #define x264_pixel_ads1_avx2 x264_template(pixel_ads1_avx2) #define x264_pixel_ads1_mmx2 x264_template(pixel_ads1_mmx2) #define x264_pixel_ads1_sse2 x264_template(pixel_ads1_sse2) #define x264_pixel_ads1_ssse3 x264_template(pixel_ads1_ssse3) #define x264_pixel_ads2_avx x264_template(pixel_ads2_avx) #define x264_pixel_ads2_avx2 x264_template(pixel_ads2_avx2) #define x264_pixel_ads2_mmx2 x264_template(pixel_ads2_mmx2) #define x264_pixel_ads2_sse2 x264_template(pixel_ads2_sse2) #define x264_pixel_ads2_ssse3 x264_template(pixel_ads2_ssse3) #define x264_pixel_ads4_avx x264_template(pixel_ads4_avx) #define x264_pixel_ads4_avx2 x264_template(pixel_ads4_avx2) #define x264_pixel_ads4_mmx2 x264_template(pixel_ads4_mmx2) #define x264_pixel_ads4_sse2 x264_template(pixel_ads4_sse2) #define x264_pixel_ads4_ssse3 x264_template(pixel_ads4_ssse3) #define x264_pixel_hadamard_ac_16x16_avx x264_template(pixel_hadamard_ac_16x16_avx) #define x264_pixel_hadamard_ac_16x16_avx2 x264_template(pixel_hadamard_ac_16x16_avx2) #define x264_pixel_hadamard_ac_16x16_mmx2 x264_template(pixel_hadamard_ac_16x16_mmx2) #define x264_pixel_hadamard_ac_16x16_sse2 x264_template(pixel_hadamard_ac_16x16_sse2) #define x264_pixel_hadamard_ac_16x16_sse4 x264_template(pixel_hadamard_ac_16x16_sse4) #define x264_pixel_hadamard_ac_16x16_ssse3 x264_template(pixel_hadamard_ac_16x16_ssse3) #define x264_pixel_hadamard_ac_16x16_ssse3_atom x264_template(pixel_hadamard_ac_16x16_ssse3_atom) #define x264_pixel_hadamard_ac_16x16_xop x264_template(pixel_hadamard_ac_16x16_xop) #define x264_pixel_hadamard_ac_16x8_avx x264_template(pixel_hadamard_ac_16x8_avx) #define x264_pixel_hadamard_ac_16x8_avx2 x264_template(pixel_hadamard_ac_16x8_avx2) #define x264_pixel_hadamard_ac_16x8_mmx2 x264_template(pixel_hadamard_ac_16x8_mmx2) #define x264_pixel_hadamard_ac_16x8_sse2 x264_template(pixel_hadamard_ac_16x8_sse2) #define x264_pixel_hadamard_ac_16x8_sse4 x264_template(pixel_hadamard_ac_16x8_sse4) #define x264_pixel_hadamard_ac_16x8_ssse3 x264_template(pixel_hadamard_ac_16x8_ssse3) #define x264_pixel_hadamard_ac_16x8_ssse3_atom x264_template(pixel_hadamard_ac_16x8_ssse3_atom) #define x264_pixel_hadamard_ac_16x8_xop x264_template(pixel_hadamard_ac_16x8_xop) #define x264_pixel_hadamard_ac_8x16_avx x264_template(pixel_hadamard_ac_8x16_avx) #define 
x264_pixel_hadamard_ac_8x16_mmx2 x264_template(pixel_hadamard_ac_8x16_mmx2) #define x264_pixel_hadamard_ac_8x16_sse2 x264_template(pixel_hadamard_ac_8x16_sse2) #define x264_pixel_hadamard_ac_8x16_sse4 x264_template(pixel_hadamard_ac_8x16_sse4) #define x264_pixel_hadamard_ac_8x16_ssse3 x264_template(pixel_hadamard_ac_8x16_ssse3) #define x264_pixel_hadamard_ac_8x16_ssse3_atom x264_template(pixel_hadamard_ac_8x16_ssse3_atom) #define x264_pixel_hadamard_ac_8x16_xop x264_template(pixel_hadamard_ac_8x16_xop) #define x264_pixel_hadamard_ac_8x8_avx x264_template(pixel_hadamard_ac_8x8_avx) #define x264_pixel_hadamard_ac_8x8_mmx2 x264_template(pixel_hadamard_ac_8x8_mmx2) #define x264_pixel_hadamard_ac_8x8_sse2 x264_template(pixel_hadamard_ac_8x8_sse2) #define x264_pixel_hadamard_ac_8x8_sse4 x264_template(pixel_hadamard_ac_8x8_sse4) #define x264_pixel_hadamard_ac_8x8_ssse3 x264_template(pixel_hadamard_ac_8x8_ssse3) #define x264_pixel_hadamard_ac_8x8_ssse3_atom x264_template(pixel_hadamard_ac_8x8_ssse3_atom) #define x264_pixel_hadamard_ac_8x8_xop x264_template(pixel_hadamard_ac_8x8_xop) #define x264_pixel_sa8d_16x16_mmx2 x264_template(pixel_sa8d_16x16_mmx2) #define x264_pixel_sa8d_16x16_avx x264_template(pixel_sa8d_16x16_avx) #define x264_pixel_sa8d_16x16_sse2 x264_template(pixel_sa8d_16x16_sse2) #define x264_pixel_sa8d_16x16_sse4 x264_template(pixel_sa8d_16x16_sse4) #define x264_pixel_sa8d_16x16_ssse3 x264_template(pixel_sa8d_16x16_ssse3) #define x264_pixel_sa8d_16x16_ssse3_atom x264_template(pixel_sa8d_16x16_ssse3_atom) #define x264_pixel_sa8d_16x16_xop x264_template(pixel_sa8d_16x16_xop) #define x264_pixel_sa8d_8x8_mmx2 x264_template(pixel_sa8d_8x8_mmx2) #define x264_pixel_sa8d_8x8_avx x264_template(pixel_sa8d_8x8_avx) #define x264_pixel_sa8d_8x8_avx2 x264_template(pixel_sa8d_8x8_avx2) #define x264_pixel_sa8d_8x8_avx512 x264_template(pixel_sa8d_8x8_avx512) #define x264_pixel_sa8d_8x8_sse2 x264_template(pixel_sa8d_8x8_sse2) #define x264_pixel_sa8d_8x8_sse4 x264_template(pixel_sa8d_8x8_sse4) #define x264_pixel_sa8d_8x8_ssse3 x264_template(pixel_sa8d_8x8_ssse3) #define x264_pixel_sa8d_8x8_ssse3_atom x264_template(pixel_sa8d_8x8_ssse3_atom) #define x264_pixel_sa8d_8x8_xop x264_template(pixel_sa8d_8x8_xop) #define x264_pixel_sad_16x16_avx2 x264_template(pixel_sad_16x16_avx2) #define x264_pixel_sad_16x16_avx512 x264_template(pixel_sad_16x16_avx512) #define x264_pixel_sad_16x16_cache32_mmx2 x264_template(pixel_sad_16x16_cache32_mmx2) #define x264_pixel_sad_16x16_cache64_mmx2 x264_template(pixel_sad_16x16_cache64_mmx2) #define x264_pixel_sad_16x16_cache64_sse2 x264_template(pixel_sad_16x16_cache64_sse2) #define x264_pixel_sad_16x16_cache64_ssse3 x264_template(pixel_sad_16x16_cache64_ssse3) #define x264_pixel_sad_16x16_mmx2 x264_template(pixel_sad_16x16_mmx2) #define x264_pixel_sad_16x16_sse2 x264_template(pixel_sad_16x16_sse2) #define x264_pixel_sad_16x16_sse2_aligned x264_template(pixel_sad_16x16_sse2_aligned) #define x264_pixel_sad_16x16_sse3 x264_template(pixel_sad_16x16_sse3) #define x264_pixel_sad_16x16_ssse3 x264_template(pixel_sad_16x16_ssse3) #define x264_pixel_sad_16x16_ssse3_aligned x264_template(pixel_sad_16x16_ssse3_aligned) #define x264_pixel_sad_16x8_avx2 x264_template(pixel_sad_16x8_avx2) #define x264_pixel_sad_16x8_avx512 x264_template(pixel_sad_16x8_avx512) #define x264_pixel_sad_16x8_cache32_mmx2 x264_template(pixel_sad_16x8_cache32_mmx2) #define x264_pixel_sad_16x8_cache64_mmx2 x264_template(pixel_sad_16x8_cache64_mmx2) #define x264_pixel_sad_16x8_cache64_sse2 
x264_template(pixel_sad_16x8_cache64_sse2) #define x264_pixel_sad_16x8_cache64_ssse3 x264_template(pixel_sad_16x8_cache64_ssse3) #define x264_pixel_sad_16x8_mmx2 x264_template(pixel_sad_16x8_mmx2) #define x264_pixel_sad_16x8_sse2 x264_template(pixel_sad_16x8_sse2) #define x264_pixel_sad_16x8_sse2_aligned x264_template(pixel_sad_16x8_sse2_aligned) #define x264_pixel_sad_16x8_sse3 x264_template(pixel_sad_16x8_sse3) #define x264_pixel_sad_16x8_ssse3 x264_template(pixel_sad_16x8_ssse3) #define x264_pixel_sad_16x8_ssse3_aligned x264_template(pixel_sad_16x8_ssse3_aligned) #define x264_pixel_sad_4x16_avx512 x264_template(pixel_sad_4x16_avx512) #define x264_pixel_sad_4x16_mmx2 x264_template(pixel_sad_4x16_mmx2) #define x264_pixel_sad_4x4_avx512 x264_template(pixel_sad_4x4_avx512) #define x264_pixel_sad_4x4_mmx2 x264_template(pixel_sad_4x4_mmx2) #define x264_pixel_sad_4x4_ssse3 x264_template(pixel_sad_4x4_ssse3) #define x264_pixel_sad_4x8_avx512 x264_template(pixel_sad_4x8_avx512) #define x264_pixel_sad_4x8_mmx2 x264_template(pixel_sad_4x8_mmx2) #define x264_pixel_sad_4x8_ssse3 x264_template(pixel_sad_4x8_ssse3) #define x264_pixel_sad_8x16_avx512 x264_template(pixel_sad_8x16_avx512) #define x264_pixel_sad_8x16_cache32_mmx2 x264_template(pixel_sad_8x16_cache32_mmx2) #define x264_pixel_sad_8x16_cache64_mmx2 x264_template(pixel_sad_8x16_cache64_mmx2) #define x264_pixel_sad_8x16_mmx2 x264_template(pixel_sad_8x16_mmx2) #define x264_pixel_sad_8x16_sse2 x264_template(pixel_sad_8x16_sse2) #define x264_pixel_sad_8x16_sse2_aligned x264_template(pixel_sad_8x16_sse2_aligned) #define x264_pixel_sad_8x16_ssse3 x264_template(pixel_sad_8x16_ssse3) #define x264_pixel_sad_8x16_ssse3_aligned x264_template(pixel_sad_8x16_ssse3_aligned) #define x264_pixel_sad_8x4_avx512 x264_template(pixel_sad_8x4_avx512) #define x264_pixel_sad_8x4_cache32_mmx2 x264_template(pixel_sad_8x4_cache32_mmx2) #define x264_pixel_sad_8x4_cache64_mmx2 x264_template(pixel_sad_8x4_cache64_mmx2) #define x264_pixel_sad_8x4_mmx2 x264_template(pixel_sad_8x4_mmx2) #define x264_pixel_sad_8x4_sse2 x264_template(pixel_sad_8x4_sse2) #define x264_pixel_sad_8x4_ssse3 x264_template(pixel_sad_8x4_ssse3) #define x264_pixel_sad_8x8_avx512 x264_template(pixel_sad_8x8_avx512) #define x264_pixel_sad_8x8_cache32_mmx2 x264_template(pixel_sad_8x8_cache32_mmx2) #define x264_pixel_sad_8x8_cache64_mmx2 x264_template(pixel_sad_8x8_cache64_mmx2) #define x264_pixel_sad_8x8_mmx2 x264_template(pixel_sad_8x8_mmx2) #define x264_pixel_sad_8x8_sse2 x264_template(pixel_sad_8x8_sse2) #define x264_pixel_sad_8x8_sse2_aligned x264_template(pixel_sad_8x8_sse2_aligned) #define x264_pixel_sad_8x8_ssse3 x264_template(pixel_sad_8x8_ssse3) #define x264_pixel_sad_8x8_ssse3_aligned x264_template(pixel_sad_8x8_ssse3_aligned) #define x264_pixel_sad_x3_16x16_avx x264_template(pixel_sad_x3_16x16_avx) #define x264_pixel_sad_x3_16x16_avx2 x264_template(pixel_sad_x3_16x16_avx2) #define x264_pixel_sad_x3_16x16_avx512 x264_template(pixel_sad_x3_16x16_avx512) #define x264_pixel_sad_x3_16x16_cache32_mmx2 x264_template(pixel_sad_x3_16x16_cache32_mmx2) #define x264_pixel_sad_x3_16x16_cache64_mmx2 x264_template(pixel_sad_x3_16x16_cache64_mmx2) #define x264_pixel_sad_x3_16x16_cache64_sse2 x264_template(pixel_sad_x3_16x16_cache64_sse2) #define x264_pixel_sad_x3_16x16_cache64_ssse3 x264_template(pixel_sad_x3_16x16_cache64_ssse3) #define x264_pixel_sad_x3_16x16_mmx2 x264_template(pixel_sad_x3_16x16_mmx2) #define x264_pixel_sad_x3_16x16_sse2 x264_template(pixel_sad_x3_16x16_sse2) #define 
x264_pixel_sad_x3_16x16_sse3 x264_template(pixel_sad_x3_16x16_sse3) #define x264_pixel_sad_x3_16x16_ssse3 x264_template(pixel_sad_x3_16x16_ssse3) #define x264_pixel_sad_x3_16x16_xop x264_template(pixel_sad_x3_16x16_xop) #define x264_pixel_sad_x3_16x8_avx x264_template(pixel_sad_x3_16x8_avx) #define x264_pixel_sad_x3_16x8_avx2 x264_template(pixel_sad_x3_16x8_avx2) #define x264_pixel_sad_x3_16x8_avx512 x264_template(pixel_sad_x3_16x8_avx512) #define x264_pixel_sad_x3_16x8_cache32_mmx2 x264_template(pixel_sad_x3_16x8_cache32_mmx2) #define x264_pixel_sad_x3_16x8_cache64_mmx2 x264_template(pixel_sad_x3_16x8_cache64_mmx2) #define x264_pixel_sad_x3_16x8_cache64_sse2 x264_template(pixel_sad_x3_16x8_cache64_sse2) #define x264_pixel_sad_x3_16x8_cache64_ssse3 x264_template(pixel_sad_x3_16x8_cache64_ssse3) #define x264_pixel_sad_x3_16x8_mmx2 x264_template(pixel_sad_x3_16x8_mmx2) #define x264_pixel_sad_x3_16x8_sse2 x264_template(pixel_sad_x3_16x8_sse2) #define x264_pixel_sad_x3_16x8_sse3 x264_template(pixel_sad_x3_16x8_sse3) #define x264_pixel_sad_x3_16x8_ssse3 x264_template(pixel_sad_x3_16x8_ssse3) #define x264_pixel_sad_x3_16x8_xop x264_template(pixel_sad_x3_16x8_xop) #define x264_pixel_sad_x3_4x4_avx512 x264_template(pixel_sad_x3_4x4_avx512) #define x264_pixel_sad_x3_4x4_mmx2 x264_template(pixel_sad_x3_4x4_mmx2) #define x264_pixel_sad_x3_4x4_ssse3 x264_template(pixel_sad_x3_4x4_ssse3) #define x264_pixel_sad_x3_4x8_avx512 x264_template(pixel_sad_x3_4x8_avx512) #define x264_pixel_sad_x3_4x8_mmx2 x264_template(pixel_sad_x3_4x8_mmx2) #define x264_pixel_sad_x3_4x8_ssse3 x264_template(pixel_sad_x3_4x8_ssse3) #define x264_pixel_sad_x3_8x16_avx512 x264_template(pixel_sad_x3_8x16_avx512) #define x264_pixel_sad_x3_8x16_cache32_mmx2 x264_template(pixel_sad_x3_8x16_cache32_mmx2) #define x264_pixel_sad_x3_8x16_cache64_mmx2 x264_template(pixel_sad_x3_8x16_cache64_mmx2) #define x264_pixel_sad_x3_8x16_cache64_sse2 x264_template(pixel_sad_x3_8x16_cache64_sse2) #define x264_pixel_sad_x3_8x16_mmx2 x264_template(pixel_sad_x3_8x16_mmx2) #define x264_pixel_sad_x3_8x16_sse2 x264_template(pixel_sad_x3_8x16_sse2) #define x264_pixel_sad_x3_8x16_ssse3 x264_template(pixel_sad_x3_8x16_ssse3) #define x264_pixel_sad_x3_8x16_xop x264_template(pixel_sad_x3_8x16_xop) #define x264_pixel_sad_x3_8x4_avx512 x264_template(pixel_sad_x3_8x4_avx512) #define x264_pixel_sad_x3_8x4_mmx2 x264_template(pixel_sad_x3_8x4_mmx2) #define x264_pixel_sad_x3_8x4_sse2 x264_template(pixel_sad_x3_8x4_sse2) #define x264_pixel_sad_x3_8x4_ssse3 x264_template(pixel_sad_x3_8x4_ssse3) #define x264_pixel_sad_x3_8x4_xop x264_template(pixel_sad_x3_8x4_xop) #define x264_pixel_sad_x3_8x8_avx512 x264_template(pixel_sad_x3_8x8_avx512) #define x264_pixel_sad_x3_8x8_cache32_mmx2 x264_template(pixel_sad_x3_8x8_cache32_mmx2) #define x264_pixel_sad_x3_8x8_cache64_mmx2 x264_template(pixel_sad_x3_8x8_cache64_mmx2) #define x264_pixel_sad_x3_8x8_mmx2 x264_template(pixel_sad_x3_8x8_mmx2) #define x264_pixel_sad_x3_8x8_sse2 x264_template(pixel_sad_x3_8x8_sse2) #define x264_pixel_sad_x3_8x8_ssse3 x264_template(pixel_sad_x3_8x8_ssse3) #define x264_pixel_sad_x3_8x8_xop x264_template(pixel_sad_x3_8x8_xop) #define x264_pixel_sad_x4_16x16_avx x264_template(pixel_sad_x4_16x16_avx) #define x264_pixel_sad_x4_16x16_avx2 x264_template(pixel_sad_x4_16x16_avx2) #define x264_pixel_sad_x4_16x16_avx512 x264_template(pixel_sad_x4_16x16_avx512) #define x264_pixel_sad_x4_16x16_cache32_mmx2 x264_template(pixel_sad_x4_16x16_cache32_mmx2) #define x264_pixel_sad_x4_16x16_cache64_mmx2 
x264_template(pixel_sad_x4_16x16_cache64_mmx2) #define x264_pixel_sad_x4_16x16_cache64_sse2 x264_template(pixel_sad_x4_16x16_cache64_sse2) #define x264_pixel_sad_x4_16x16_cache64_ssse3 x264_template(pixel_sad_x4_16x16_cache64_ssse3) #define x264_pixel_sad_x4_16x16_mmx2 x264_template(pixel_sad_x4_16x16_mmx2) #define x264_pixel_sad_x4_16x16_sse2 x264_template(pixel_sad_x4_16x16_sse2) #define x264_pixel_sad_x4_16x16_sse3 x264_template(pixel_sad_x4_16x16_sse3) #define x264_pixel_sad_x4_16x16_ssse3 x264_template(pixel_sad_x4_16x16_ssse3) #define x264_pixel_sad_x4_16x16_xop x264_template(pixel_sad_x4_16x16_xop) #define x264_pixel_sad_x4_16x8_avx x264_template(pixel_sad_x4_16x8_avx) #define x264_pixel_sad_x4_16x8_avx2 x264_template(pixel_sad_x4_16x8_avx2) #define x264_pixel_sad_x4_16x8_avx512 x264_template(pixel_sad_x4_16x8_avx512) #define x264_pixel_sad_x4_16x8_cache32_mmx2 x264_template(pixel_sad_x4_16x8_cache32_mmx2) #define x264_pixel_sad_x4_16x8_cache64_mmx2 x264_template(pixel_sad_x4_16x8_cache64_mmx2) #define x264_pixel_sad_x4_16x8_cache64_sse2 x264_template(pixel_sad_x4_16x8_cache64_sse2) #define x264_pixel_sad_x4_16x8_cache64_ssse3 x264_template(pixel_sad_x4_16x8_cache64_ssse3) #define x264_pixel_sad_x4_16x8_mmx2 x264_template(pixel_sad_x4_16x8_mmx2) #define x264_pixel_sad_x4_16x8_sse2 x264_template(pixel_sad_x4_16x8_sse2) #define x264_pixel_sad_x4_16x8_sse3 x264_template(pixel_sad_x4_16x8_sse3) #define x264_pixel_sad_x4_16x8_ssse3 x264_template(pixel_sad_x4_16x8_ssse3) #define x264_pixel_sad_x4_16x8_xop x264_template(pixel_sad_x4_16x8_xop) #define x264_pixel_sad_x4_4x4_avx512 x264_template(pixel_sad_x4_4x4_avx512) #define x264_pixel_sad_x4_4x4_mmx2 x264_template(pixel_sad_x4_4x4_mmx2) #define x264_pixel_sad_x4_4x4_ssse3 x264_template(pixel_sad_x4_4x4_ssse3) #define x264_pixel_sad_x4_4x8_avx512 x264_template(pixel_sad_x4_4x8_avx512) #define x264_pixel_sad_x4_4x8_mmx2 x264_template(pixel_sad_x4_4x8_mmx2) #define x264_pixel_sad_x4_4x8_ssse3 x264_template(pixel_sad_x4_4x8_ssse3) #define x264_pixel_sad_x4_8x16_avx512 x264_template(pixel_sad_x4_8x16_avx512) #define x264_pixel_sad_x4_8x16_cache32_mmx2 x264_template(pixel_sad_x4_8x16_cache32_mmx2) #define x264_pixel_sad_x4_8x16_cache64_mmx2 x264_template(pixel_sad_x4_8x16_cache64_mmx2) #define x264_pixel_sad_x4_8x16_cache64_sse2 x264_template(pixel_sad_x4_8x16_cache64_sse2) #define x264_pixel_sad_x4_8x16_mmx2 x264_template(pixel_sad_x4_8x16_mmx2) #define x264_pixel_sad_x4_8x16_sse2 x264_template(pixel_sad_x4_8x16_sse2) #define x264_pixel_sad_x4_8x16_ssse3 x264_template(pixel_sad_x4_8x16_ssse3) #define x264_pixel_sad_x4_8x16_xop x264_template(pixel_sad_x4_8x16_xop) #define x264_pixel_sad_x4_8x4_avx512 x264_template(pixel_sad_x4_8x4_avx512) #define x264_pixel_sad_x4_8x4_mmx2 x264_template(pixel_sad_x4_8x4_mmx2) #define x264_pixel_sad_x4_8x4_sse2 x264_template(pixel_sad_x4_8x4_sse2) #define x264_pixel_sad_x4_8x4_ssse3 x264_template(pixel_sad_x4_8x4_ssse3) #define x264_pixel_sad_x4_8x4_xop x264_template(pixel_sad_x4_8x4_xop) #define x264_pixel_sad_x4_8x8_avx512 x264_template(pixel_sad_x4_8x8_avx512) #define x264_pixel_sad_x4_8x8_cache32_mmx2 x264_template(pixel_sad_x4_8x8_cache32_mmx2) #define x264_pixel_sad_x4_8x8_cache64_mmx2 x264_template(pixel_sad_x4_8x8_cache64_mmx2) #define x264_pixel_sad_x4_8x8_mmx2 x264_template(pixel_sad_x4_8x8_mmx2) #define x264_pixel_sad_x4_8x8_sse2 x264_template(pixel_sad_x4_8x8_sse2) #define x264_pixel_sad_x4_8x8_ssse3 x264_template(pixel_sad_x4_8x8_ssse3) #define x264_pixel_sad_x4_8x8_xop 
x264_template(pixel_sad_x4_8x8_xop) #define x264_pixel_satd_16x16_avx x264_template(pixel_satd_16x16_avx) #define x264_pixel_satd_16x16_avx2 x264_template(pixel_satd_16x16_avx2) #define x264_pixel_satd_16x16_avx512 x264_template(pixel_satd_16x16_avx512) #define x264_pixel_satd_16x16_mmx2 x264_template(pixel_satd_16x16_mmx2) #define x264_pixel_satd_16x16_sse2 x264_template(pixel_satd_16x16_sse2) #define x264_pixel_satd_16x16_sse4 x264_template(pixel_satd_16x16_sse4) #define x264_pixel_satd_16x16_ssse3 x264_template(pixel_satd_16x16_ssse3) #define x264_pixel_satd_16x16_ssse3_atom x264_template(pixel_satd_16x16_ssse3_atom) #define x264_pixel_satd_16x16_xop x264_template(pixel_satd_16x16_xop) #define x264_pixel_satd_16x8_avx x264_template(pixel_satd_16x8_avx) #define x264_pixel_satd_16x8_avx2 x264_template(pixel_satd_16x8_avx2) #define x264_pixel_satd_16x8_avx512 x264_template(pixel_satd_16x8_avx512) #define x264_pixel_satd_16x8_mmx2 x264_template(pixel_satd_16x8_mmx2) #define x264_pixel_satd_16x8_sse2 x264_template(pixel_satd_16x8_sse2) #define x264_pixel_satd_16x8_sse4 x264_template(pixel_satd_16x8_sse4) #define x264_pixel_satd_16x8_ssse3 x264_template(pixel_satd_16x8_ssse3) #define x264_pixel_satd_16x8_ssse3_atom x264_template(pixel_satd_16x8_ssse3_atom) #define x264_pixel_satd_16x8_xop x264_template(pixel_satd_16x8_xop) #define x264_pixel_satd_4x16_avx x264_template(pixel_satd_4x16_avx) #define x264_pixel_satd_4x16_avx512 x264_template(pixel_satd_4x16_avx512) #define x264_pixel_satd_4x16_mmx2 x264_template(pixel_satd_4x16_mmx2) #define x264_pixel_satd_4x16_sse2 x264_template(pixel_satd_4x16_sse2) #define x264_pixel_satd_4x16_sse4 x264_template(pixel_satd_4x16_sse4) #define x264_pixel_satd_4x16_ssse3 x264_template(pixel_satd_4x16_ssse3) #define x264_pixel_satd_4x16_ssse3_atom x264_template(pixel_satd_4x16_ssse3_atom) #define x264_pixel_satd_4x4_avx x264_template(pixel_satd_4x4_avx) #define x264_pixel_satd_4x4_avx512 x264_template(pixel_satd_4x4_avx512) #define x264_pixel_satd_4x4_mmx2 x264_template(pixel_satd_4x4_mmx2) #define x264_pixel_satd_4x4_sse4 x264_template(pixel_satd_4x4_sse4) #define x264_pixel_satd_4x4_ssse3 x264_template(pixel_satd_4x4_ssse3) #define x264_pixel_satd_4x4_xop x264_template(pixel_satd_4x4_xop) #define x264_pixel_satd_4x8_avx x264_template(pixel_satd_4x8_avx) #define x264_pixel_satd_4x8_avx512 x264_template(pixel_satd_4x8_avx512) #define x264_pixel_satd_4x8_mmx2 x264_template(pixel_satd_4x8_mmx2) #define x264_pixel_satd_4x8_sse2 x264_template(pixel_satd_4x8_sse2) #define x264_pixel_satd_4x8_sse4 x264_template(pixel_satd_4x8_sse4) #define x264_pixel_satd_4x8_ssse3 x264_template(pixel_satd_4x8_ssse3) #define x264_pixel_satd_4x8_ssse3_atom x264_template(pixel_satd_4x8_ssse3_atom) #define x264_pixel_satd_4x8_xop x264_template(pixel_satd_4x8_xop) #define x264_pixel_satd_8x16_avx x264_template(pixel_satd_8x16_avx) #define x264_pixel_satd_8x16_avx2 x264_template(pixel_satd_8x16_avx2) #define x264_pixel_satd_8x16_avx512 x264_template(pixel_satd_8x16_avx512) #define x264_pixel_satd_8x16_mmx2 x264_template(pixel_satd_8x16_mmx2) #define x264_pixel_satd_8x16_sse2 x264_template(pixel_satd_8x16_sse2) #define x264_pixel_satd_8x16_sse4 x264_template(pixel_satd_8x16_sse4) #define x264_pixel_satd_8x16_ssse3 x264_template(pixel_satd_8x16_ssse3) #define x264_pixel_satd_8x16_ssse3_atom x264_template(pixel_satd_8x16_ssse3_atom) #define x264_pixel_satd_8x16_xop x264_template(pixel_satd_8x16_xop) #define x264_pixel_satd_8x4_avx x264_template(pixel_satd_8x4_avx) #define 
x264_pixel_satd_8x4_avx512 x264_template(pixel_satd_8x4_avx512) #define x264_pixel_satd_8x4_mmx2 x264_template(pixel_satd_8x4_mmx2) #define x264_pixel_satd_8x4_sse2 x264_template(pixel_satd_8x4_sse2) #define x264_pixel_satd_8x4_sse4 x264_template(pixel_satd_8x4_sse4) #define x264_pixel_satd_8x4_ssse3 x264_template(pixel_satd_8x4_ssse3) #define x264_pixel_satd_8x4_ssse3_atom x264_template(pixel_satd_8x4_ssse3_atom) #define x264_pixel_satd_8x4_xop x264_template(pixel_satd_8x4_xop) #define x264_pixel_satd_8x8_avx x264_template(pixel_satd_8x8_avx) #define x264_pixel_satd_8x8_avx2 x264_template(pixel_satd_8x8_avx2) #define x264_pixel_satd_8x8_avx512 x264_template(pixel_satd_8x8_avx512) #define x264_pixel_satd_8x8_mmx2 x264_template(pixel_satd_8x8_mmx2) #define x264_pixel_satd_8x8_sse2 x264_template(pixel_satd_8x8_sse2) #define x264_pixel_satd_8x8_sse4 x264_template(pixel_satd_8x8_sse4) #define x264_pixel_satd_8x8_ssse3 x264_template(pixel_satd_8x8_ssse3) #define x264_pixel_satd_8x8_ssse3_atom x264_template(pixel_satd_8x8_ssse3_atom) #define x264_pixel_satd_8x8_xop x264_template(pixel_satd_8x8_xop) #define x264_pixel_ssd_16x16_avx x264_template(pixel_ssd_16x16_avx) #define x264_pixel_ssd_16x16_avx2 x264_template(pixel_ssd_16x16_avx2) #define x264_pixel_ssd_16x16_mmx x264_template(pixel_ssd_16x16_mmx) #define x264_pixel_ssd_16x16_mmx2 x264_template(pixel_ssd_16x16_mmx2) #define x264_pixel_ssd_16x16_sse2 x264_template(pixel_ssd_16x16_sse2) #define x264_pixel_ssd_16x16_sse2slow x264_template(pixel_ssd_16x16_sse2slow) #define x264_pixel_ssd_16x16_ssse3 x264_template(pixel_ssd_16x16_ssse3) #define x264_pixel_ssd_16x16_xop x264_template(pixel_ssd_16x16_xop) #define x264_pixel_ssd_16x8_avx x264_template(pixel_ssd_16x8_avx) #define x264_pixel_ssd_16x8_avx2 x264_template(pixel_ssd_16x8_avx2) #define x264_pixel_ssd_16x8_mmx x264_template(pixel_ssd_16x8_mmx) #define x264_pixel_ssd_16x8_mmx2 x264_template(pixel_ssd_16x8_mmx2) #define x264_pixel_ssd_16x8_sse2 x264_template(pixel_ssd_16x8_sse2) #define x264_pixel_ssd_16x8_sse2slow x264_template(pixel_ssd_16x8_sse2slow) #define x264_pixel_ssd_16x8_ssse3 x264_template(pixel_ssd_16x8_ssse3) #define x264_pixel_ssd_16x8_xop x264_template(pixel_ssd_16x8_xop) #define x264_pixel_ssd_4x16_mmx x264_template(pixel_ssd_4x16_mmx) #define x264_pixel_ssd_4x16_mmx2 x264_template(pixel_ssd_4x16_mmx2) #define x264_pixel_ssd_4x16_ssse3 x264_template(pixel_ssd_4x16_ssse3) #define x264_pixel_ssd_4x4_mmx x264_template(pixel_ssd_4x4_mmx) #define x264_pixel_ssd_4x4_mmx2 x264_template(pixel_ssd_4x4_mmx2) #define x264_pixel_ssd_4x4_ssse3 x264_template(pixel_ssd_4x4_ssse3) #define x264_pixel_ssd_4x8_mmx x264_template(pixel_ssd_4x8_mmx) #define x264_pixel_ssd_4x8_mmx2 x264_template(pixel_ssd_4x8_mmx2) #define x264_pixel_ssd_4x8_ssse3 x264_template(pixel_ssd_4x8_ssse3) #define x264_pixel_ssd_8x16_avx x264_template(pixel_ssd_8x16_avx) #define x264_pixel_ssd_8x16_mmx x264_template(pixel_ssd_8x16_mmx) #define x264_pixel_ssd_8x16_mmx2 x264_template(pixel_ssd_8x16_mmx2) #define x264_pixel_ssd_8x16_sse2 x264_template(pixel_ssd_8x16_sse2) #define x264_pixel_ssd_8x16_sse2slow x264_template(pixel_ssd_8x16_sse2slow) #define x264_pixel_ssd_8x16_ssse3 x264_template(pixel_ssd_8x16_ssse3) #define x264_pixel_ssd_8x16_xop x264_template(pixel_ssd_8x16_xop) #define x264_pixel_ssd_8x4_avx x264_template(pixel_ssd_8x4_avx) #define x264_pixel_ssd_8x4_mmx x264_template(pixel_ssd_8x4_mmx) #define x264_pixel_ssd_8x4_mmx2 x264_template(pixel_ssd_8x4_mmx2) #define x264_pixel_ssd_8x4_sse2 
x264_template(pixel_ssd_8x4_sse2) #define x264_pixel_ssd_8x4_sse2slow x264_template(pixel_ssd_8x4_sse2slow) #define x264_pixel_ssd_8x4_ssse3 x264_template(pixel_ssd_8x4_ssse3) #define x264_pixel_ssd_8x4_xop x264_template(pixel_ssd_8x4_xop) #define x264_pixel_ssd_8x8_avx x264_template(pixel_ssd_8x8_avx) #define x264_pixel_ssd_8x8_mmx x264_template(pixel_ssd_8x8_mmx) #define x264_pixel_ssd_8x8_mmx2 x264_template(pixel_ssd_8x8_mmx2) #define x264_pixel_ssd_8x8_sse2 x264_template(pixel_ssd_8x8_sse2) #define x264_pixel_ssd_8x8_sse2slow x264_template(pixel_ssd_8x8_sse2slow) #define x264_pixel_ssd_8x8_ssse3 x264_template(pixel_ssd_8x8_ssse3) #define x264_pixel_ssd_8x8_xop x264_template(pixel_ssd_8x8_xop) #define x264_pixel_var_16x16_avx x264_template(pixel_var_16x16_avx) #define x264_pixel_var_16x16_avx2 x264_template(pixel_var_16x16_avx2) #define x264_pixel_var_16x16_avx512 x264_template(pixel_var_16x16_avx512) #define x264_pixel_var_16x16_sse2 x264_template(pixel_var_16x16_sse2) #define x264_pixel_var_8x16_avx x264_template(pixel_var_8x16_avx) #define x264_pixel_var_8x16_avx512 x264_template(pixel_var_8x16_avx512) #define x264_pixel_var_8x16_sse2 x264_template(pixel_var_8x16_sse2) #define x264_pixel_var_8x8_avx x264_template(pixel_var_8x8_avx) #define x264_pixel_var_8x8_avx512 x264_template(pixel_var_8x8_avx512) #define x264_pixel_var_8x8_sse2 x264_template(pixel_var_8x8_sse2) #define DECL_PIXELS( ret, name, suffix, args ) \ ret x264_pixel_##name##_16x16_##suffix args;\ ret x264_pixel_##name##_16x8_##suffix args;\ ret x264_pixel_##name##_8x16_##suffix args;\ ret x264_pixel_##name##_8x8_##suffix args;\ ret x264_pixel_##name##_8x4_##suffix args;\ ret x264_pixel_##name##_4x16_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ ret x264_pixel_##name##_4x4_##suffix args;\ #define DECL_X1( name, suffix ) \ DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) ) #define DECL_X4( name, suffix ) \ DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\ DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) DECL_X1( sad, mmx2 ) DECL_X1( sad, sse2 ) DECL_X1( sad, sse3 ) DECL_X1( sad, sse2_aligned ) DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) DECL_X1( sad, avx2 ) DECL_X1( sad, avx512 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) DECL_X4( sad, xop ) DECL_X4( sad, avx ) DECL_X4( sad, avx2 ) DECL_X4( sad, avx512 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) DECL_X1( ssd, sse2 ) DECL_X1( ssd, ssse3 ) DECL_X1( ssd, avx ) DECL_X1( ssd, xop ) DECL_X1( ssd, avx2 ) DECL_X1( satd, mmx2 ) DECL_X1( satd, sse2 ) DECL_X1( satd, ssse3 ) DECL_X1( satd, ssse3_atom ) DECL_X1( satd, sse4 ) DECL_X1( satd, avx ) DECL_X1( satd, xop ) DECL_X1( satd, avx2 ) DECL_X1( satd, avx512 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) DECL_X1( sa8d, ssse3_atom ) DECL_X1( sa8d, sse4 ) DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) DECL_X1( sa8d, avx2 ) DECL_X1( sa8d, avx512 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); DECL_X1( sad, cache64_ssse3 ); DECL_X4( sad, cache32_mmx2 ); DECL_X4( sad, cache64_mmx2 ); DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, 
var, avx512, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride )) #define x264_intra_satd_x3_4x4_mmx2 x264_template(intra_satd_x3_4x4_mmx2) void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * ); #define x264_intra_sad_x3_4x4_mmx2 x264_template(intra_sad_x3_4x4_mmx2) void x264_intra_sad_x3_4x4_mmx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_4x4_sse2 x264_template(intra_sad_x3_4x4_sse2) void x264_intra_sad_x3_4x4_sse2 ( uint16_t*, uint16_t*, int * ); #define x264_intra_sad_x3_4x4_ssse3 x264_template(intra_sad_x3_4x4_ssse3) void x264_intra_sad_x3_4x4_ssse3 ( uint16_t*, uint16_t*, int * ); #define x264_intra_sad_x3_4x4_avx x264_template(intra_sad_x3_4x4_avx) void x264_intra_sad_x3_4x4_avx ( uint16_t*, uint16_t*, int * ); #define x264_intra_satd_x3_8x8c_mmx2 x264_template(intra_satd_x3_8x8c_mmx2) void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * ); #define x264_intra_satd_x3_8x8c_ssse3 x264_template(intra_satd_x3_8x8c_ssse3) void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_8x8c_mmx2 x264_template(intra_sad_x3_8x8c_mmx2) void x264_intra_sad_x3_8x8c_mmx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_8x8c_ssse3 x264_template(intra_sad_x3_8x8c_ssse3) void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_8x8c_avx2 x264_template(intra_sad_x3_8x8c_avx2) void x264_intra_sad_x3_8x8c_avx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_satd_x3_16x16_mmx2 x264_template(intra_satd_x3_16x16_mmx2) void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * ); #define x264_intra_satd_x3_16x16_ssse3 x264_template(intra_satd_x3_16x16_ssse3) void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_16x16_mmx2 x264_template(intra_sad_x3_16x16_mmx2) void x264_intra_sad_x3_16x16_mmx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_16x16_sse2 x264_template(intra_sad_x3_16x16_sse2) void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_16x16_ssse3 x264_template(intra_sad_x3_16x16_ssse3) void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_16x16_avx2 x264_template(intra_sad_x3_16x16_avx2) void x264_intra_sad_x3_16x16_avx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sa8d_x3_8x8_mmx2 x264_template(intra_sa8d_x3_8x8_mmx2) void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sa8d_x3_8x8_sse2 x264_template(intra_sa8d_x3_8x8_sse2) void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_8x8_mmx2 x264_template(intra_sad_x3_8x8_mmx2) void x264_intra_sad_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * ); #define x264_intra_sad_x3_8x8_sse2 x264_template(intra_sad_x3_8x8_sse2) void x264_intra_sad_x3_8x8_sse2 ( uint16_t*, uint16_t*, int * ); #define x264_intra_sad_x3_8x8_ssse3 x264_template(intra_sad_x3_8x8_ssse3) 
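/* Note on the naming pattern used throughout this header: every exported
 * symbol is wrapped in x264_template(), which in the multi-bit-depth build
 * mangles the bit depth into the name so the 8-bit and 10-bit objects can
 * coexist in one library. Roughly, as an illustration rather than the exact
 * macro definition:
 *
 *     #define x264_pixel_satd_16x16_avx2 x264_template(pixel_satd_16x16_avx2)
 *     // expands to x264_8_pixel_satd_16x16_avx2 or
 *     // x264_10_pixel_satd_16x16_avx2, depending on BIT_DEPTH
 */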
void x264_intra_sad_x3_8x8_ssse3 ( uint16_t*, uint16_t*, int * ); #define x264_intra_sad_x3_8x8_avx2 x264_template(intra_sad_x3_8x8_avx2) void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * ); #define x264_intra_satd_x9_4x4_ssse3 x264_template(intra_satd_x9_4x4_ssse3) int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_satd_x9_4x4_sse4 x264_template(intra_satd_x9_4x4_sse4) int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_satd_x9_4x4_avx x264_template(intra_satd_x9_4x4_avx) int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_satd_x9_4x4_xop x264_template(intra_satd_x9_4x4_xop) int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_sad_x9_4x4_ssse3 x264_template(intra_sad_x9_4x4_ssse3) int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_sad_x9_4x4_sse4 x264_template(intra_sad_x9_4x4_sse4) int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_sad_x9_4x4_avx x264_template(intra_sad_x9_4x4_avx) int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * ); #define x264_intra_sa8d_x9_8x8_ssse3 x264_template(intra_sa8d_x9_8x8_ssse3) int x264_intra_sa8d_x9_8x8_ssse3( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_intra_sa8d_x9_8x8_sse4 x264_template(intra_sa8d_x9_8x8_sse4) int x264_intra_sa8d_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_intra_sa8d_x9_8x8_avx x264_template(intra_sa8d_x9_8x8_avx) int x264_intra_sa8d_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_intra_sad_x9_8x8_ssse3 x264_template(intra_sad_x9_8x8_ssse3) int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_intra_sad_x9_8x8_sse4 x264_template(intra_sad_x9_8x8_sse4) int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_intra_sad_x9_8x8_avx x264_template(intra_sad_x9_8x8_avx) int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_intra_sad_x9_8x8_avx2 x264_template(intra_sad_x9_8x8_avx2) int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * ); #define x264_pixel_ssd_nv12_core_sse2 x264_template(pixel_ssd_nv12_core_sse2) void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); #define x264_pixel_ssd_nv12_core_avx x264_template(pixel_ssd_nv12_core_avx) void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); #define x264_pixel_ssd_nv12_core_xop x264_template(pixel_ssd_nv12_core_xop) void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); #define x264_pixel_ssd_nv12_core_avx2 x264_template(pixel_ssd_nv12_core_avx2) void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); #define x264_pixel_ssim_4x4x2_core_mmx2 x264_template(pixel_ssim_4x4x2_core_mmx2) void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1, const uint8_t *pix2, intptr_t stride2, int sums[2][4] ); #define 
x264_pixel_ssim_4x4x2_core_sse2 x264_template(pixel_ssim_4x4x2_core_sse2) void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4] ); #define x264_pixel_ssim_4x4x2_core_avx x264_template(pixel_ssim_4x4x2_core_avx) void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4] ); #define x264_pixel_ssim_end4_sse2 x264_template(pixel_ssim_end4_sse2) float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); #define x264_pixel_ssim_end4_avx x264_template(pixel_ssim_end4_avx) float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width ); #define x264_pixel_var2_8x8_sse2 x264_template(pixel_var2_8x8_sse2) int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] ); #define x264_pixel_var2_8x8_ssse3 x264_template(pixel_var2_8x8_ssse3) int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] ); #define x264_pixel_var2_8x8_avx2 x264_template(pixel_var2_8x8_avx2) int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] ); #define x264_pixel_var2_8x8_avx512 x264_template(pixel_var2_8x8_avx512) int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] ); #define x264_pixel_var2_8x16_sse2 x264_template(pixel_var2_8x16_sse2) int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] ); #define x264_pixel_var2_8x16_ssse3 x264_template(pixel_var2_8x16_ssse3) int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] ); #define x264_pixel_var2_8x16_avx2 x264_template(pixel_var2_8x16_avx2) int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] ); #define x264_pixel_var2_8x16_avx512 x264_template(pixel_var2_8x16_avx512) int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] ); #define x264_pixel_vsad_mmx2 x264_template(pixel_vsad_mmx2) int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); #define x264_pixel_vsad_sse2 x264_template(pixel_vsad_sse2) int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); #define x264_pixel_vsad_ssse3 x264_template(pixel_vsad_ssse3) int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); #define x264_pixel_vsad_xop x264_template(pixel_vsad_xop) int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height ); #define x264_pixel_vsad_avx2 x264_template(pixel_vsad_avx2) int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height ); #define x264_pixel_asd8_sse2 x264_template(pixel_asd8_sse2) int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); #define x264_pixel_asd8_ssse3 x264_template(pixel_asd8_ssse3) int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); #define x264_pixel_asd8_xop x264_template(pixel_asd8_xop) int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); #define x264_pixel_sa8d_satd_16x16_sse2 x264_template(pixel_sa8d_satd_16x16_sse2) uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); #define x264_pixel_sa8d_satd_16x16_ssse3 x264_template(pixel_sa8d_satd_16x16_ssse3) uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); #define x264_pixel_sa8d_satd_16x16_ssse3_atom x264_template(pixel_sa8d_satd_16x16_ssse3_atom) uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, 
pixel *pix2, intptr_t stride2 ); #define x264_pixel_sa8d_satd_16x16_sse4 x264_template(pixel_sa8d_satd_16x16_sse4) uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); #define x264_pixel_sa8d_satd_16x16_avx x264_template(pixel_sa8d_satd_16x16_avx) uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); #define x264_pixel_sa8d_satd_16x16_xop x264_template(pixel_sa8d_satd_16x16_xop) uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); #define x264_pixel_sa8d_satd_16x16_avx2 x264_template(pixel_sa8d_satd_16x16_avx2) uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh ); DECL_ADS( 4, mmx2 ) DECL_ADS( 2, mmx2 ) DECL_ADS( 1, mmx2 ) DECL_ADS( 4, sse2 ) DECL_ADS( 2, sse2 ) DECL_ADS( 1, sse2 ) DECL_ADS( 4, ssse3 ) DECL_ADS( 2, ssse3 ) DECL_ADS( 1, ssse3 ) DECL_ADS( 4, avx ) DECL_ADS( 2, avx ) DECL_ADS( 1, avx ) DECL_ADS( 4, avx2 ) DECL_ADS( 2, avx2 ) DECL_ADS( 1, avx2 ) #undef DECL_PIXELS #undef DECL_X1 #undef DECL_X4 #undef DECL_ADS #endif x264-master/common/x86/predict-a.asm000066400000000000000000001642351502133446700173610ustar00rootroot00000000000000;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** ;* Copyright (C) 2005-2025 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz ;* Fiona Glaser ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 16 dw -3 pw_m7: times 16 dw -7 pb_00s_ff: times 8 db 0 pb_0s_ff: times 7 db 0 db 0xff shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0 shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7 pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 SECTION .text cextern pb_0 cextern pb_1 cextern pb_3 cextern pw_1 cextern pw_2 cextern pw_4 cextern pw_8 cextern pw_16 cextern pw_00ff cextern pw_pixel_max cextern pw_0to15 %macro STORE8 1 mova [r0+0*FDEC_STRIDEB], %1 mova [r0+1*FDEC_STRIDEB], %1 add r0, 4*FDEC_STRIDEB mova [r0-2*FDEC_STRIDEB], %1 mova [r0-1*FDEC_STRIDEB], %1 mova [r0+0*FDEC_STRIDEB], %1 mova [r0+1*FDEC_STRIDEB], %1 mova [r0+2*FDEC_STRIDEB], %1 mova [r0+3*FDEC_STRIDEB], %1 %endmacro %macro STORE16 1-4 %if %0 > 1 mov r1d, 2*%0 .loop: mova [r0+0*FDEC_STRIDEB+0*mmsize], %1 mova [r0+0*FDEC_STRIDEB+1*mmsize], %2 mova [r0+1*FDEC_STRIDEB+0*mmsize], %1 mova [r0+1*FDEC_STRIDEB+1*mmsize], %2 %ifidn %0, 4 mova [r0+0*FDEC_STRIDEB+2*mmsize], %3 mova [r0+0*FDEC_STRIDEB+3*mmsize], %4 mova [r0+1*FDEC_STRIDEB+2*mmsize], %3 mova [r0+1*FDEC_STRIDEB+3*mmsize], %4 add r0, 2*FDEC_STRIDEB %else ; %0 == 2 add r0, 4*FDEC_STRIDEB mova [r0-2*FDEC_STRIDEB+0*mmsize], %1 mova [r0-2*FDEC_STRIDEB+1*mmsize], %2 mova [r0-1*FDEC_STRIDEB+0*mmsize], %1 mova [r0-1*FDEC_STRIDEB+1*mmsize], %2 %endif dec r1d jg .loop %else ; %0 == 1 STORE8 %1 %if HIGH_BIT_DEPTH ; Different code paths to reduce code size add r0, 6*FDEC_STRIDEB mova [r0-2*FDEC_STRIDEB], %1 mova [r0-1*FDEC_STRIDEB], %1 mova [r0+0*FDEC_STRIDEB], %1 mova [r0+1*FDEC_STRIDEB], %1 add r0, 4*FDEC_STRIDEB mova [r0-2*FDEC_STRIDEB], %1 mova [r0-1*FDEC_STRIDEB], %1 mova [r0+0*FDEC_STRIDEB], %1 mova [r0+1*FDEC_STRIDEB], %1 %else add r0, 8*FDEC_STRIDE mova [r0-4*FDEC_STRIDE], %1 mova [r0-3*FDEC_STRIDE], %1 mova [r0-2*FDEC_STRIDE], %1 mova [r0-1*FDEC_STRIDE], %1 mova [r0+0*FDEC_STRIDE], %1 mova [r0+1*FDEC_STRIDE], %1 mova [r0+2*FDEC_STRIDE], %1 mova [r0+3*FDEC_STRIDE], %1 %endif ; HIGH_BIT_DEPTH %endif %endmacro %macro PRED_H_LOAD 2 ; reg, offset %if cpuflag(avx2) vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL] %elif HIGH_BIT_DEPTH movd %1, [r0+(%2)*FDEC_STRIDEB-4] SPLATW %1, %1, 1 %else SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2 %endif %endmacro %macro PRED_H_STORE 3 ; reg, offset, width %assign %%w %3*SIZEOF_PIXEL %if %%w == 8 movq [r0+(%2)*FDEC_STRIDEB], %1 %else %assign %%i 0 %rep %%w/mmsize mova [r0+(%2)*FDEC_STRIDEB+%%i], %1 %assign %%i %%i+mmsize %endrep %endif %endmacro %macro PRED_H_4ROWS 2 ; width, inc_ptr PRED_H_LOAD m0, 0 PRED_H_LOAD m1, 1 PRED_H_STORE m0, 0, %1 PRED_H_STORE m1, 1, %1 PRED_H_LOAD m0, 2 %if %2 add r0, 4*FDEC_STRIDEB %endif PRED_H_LOAD m1, 3-4*%2 PRED_H_STORE m0, 2-4*%2, %1 PRED_H_STORE m1, 3-4*%2, %1 %endmacro ; dest, left, right, src, tmp ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 %macro PRED8x8_LOWPASS 4-5 %if HIGH_BIT_DEPTH paddw %2, %3 psrlw %2, 1 pavgw %1, %4, %2 %else mova %5, %2 pavgb %2, %3 pxor %3, %5 pand %3, [pb_1] psubusb %2, %3 pavgb %1, %4, %2 %endif %endmacro ;----------------------------------------------------------------------------- ; void predict_4x4_h( pixel *src ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM avx2 cglobal 
predict_4x4_h, 1,1 PRED_H_4ROWS 4, 0 RET %endif ;----------------------------------------------------------------------------- ; void predict_4x4_ddl( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_4x4_DDL 0 cglobal predict_4x4_ddl, 1,1 movu m1, [r0-FDEC_STRIDEB] PSLLPIX m2, m1, 1 mova m0, m1 %if HIGH_BIT_DEPTH PSRLPIX m1, m1, 1 pshufhw m1, m1, q2210 %else pxor m1, m2 PSRLPIX m1, m1, 1 pxor m1, m0 %endif PRED8x8_LOWPASS m0, m2, m1, m0, m3 %assign Y 0 %rep 4 PSRLPIX m0, m0, 1 movh [r0+Y*FDEC_STRIDEB], m0 %assign Y (Y+1) %endrep RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_4x4_DDL INIT_XMM avx PREDICT_4x4_DDL INIT_MMX mmx2 cglobal predict_4x4_ddl, 1,2 movu m1, [r0-FDEC_STRIDEB+4] PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2] mova m3, [r0-FDEC_STRIDEB+8] mova [r0+0*FDEC_STRIDEB], m0 pshufw m4, m3, q3321 PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3 mova [r0+3*FDEC_STRIDEB], m2 pshufw m1, m0, q0021 punpckldq m1, m2 mova [r0+1*FDEC_STRIDEB], m1 psllq m0, 16 PALIGNR m2, m0, 6, m0 mova [r0+2*FDEC_STRIDEB], m2 RET %else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 PREDICT_4x4_DDL %endif ;----------------------------------------------------------------------------- ; void predict_4x4_vr( pixel *src ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH == 0 INIT_MMX ssse3 cglobal predict_4x4_vr, 1,1 movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0 mova m4, m1 palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt pavgb m4, m1 palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0 mova m0, m1 palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1 mova m2, m1 palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2 PRED8x8_LOWPASS m2, m0, m1, m2, m3 pshufw m0, m2, 0 psrlq m2, 16 movd [r0+0*FDEC_STRIDEB], m4 palignr m4, m0, 7 movd [r0+1*FDEC_STRIDEB], m2 psllq m0, 8 movd [r0+2*FDEC_STRIDEB], m4 palignr m2, m0, 7 movd [r0+3*FDEC_STRIDEB], m2 RET %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_4x4_ddr( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_4x4 4 cglobal predict_4x4_ddr, 1,1 %if HIGH_BIT_DEPTH movu m2, [r0-1*FDEC_STRIDEB-8] pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2 pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1 pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0 movhps m3, [r0+3*FDEC_STRIDEB-8] %else ; !HIGH_BIT_DEPTH movd m0, [r0+2*FDEC_STRIDEB-4] movd m1, [r0+0*FDEC_STRIDEB-4] punpcklbw m0, [r0+1*FDEC_STRIDEB-4] punpcklbw m1, [r0-1*FDEC_STRIDEB-4] punpckhwd m0, m1 movd m2, [r0-1*FDEC_STRIDEB] %if cpuflag(ssse3) palignr m2, m0, 4 %else psllq m2, 32 punpckhdq m0, m2 SWAP 2, 0 %endif movd m3, [r0+3*FDEC_STRIDEB-4] psllq m3, 32 %endif ; !HIGH_BIT_DEPTH PSRLPIX m1, m2, 1 mova m0, m2 PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3 PRED8x8_LOWPASS m0, m2, m1, m0, m3 %assign Y 3 movh [r0+Y*FDEC_STRIDEB], m0 %rep 3 %assign Y (Y-1) PSRLPIX m0, m0, 1 movh [r0+Y*FDEC_STRIDEB], m0 %endrep RET ;----------------------------------------------------------------------------- ; void predict_4x4_vr( pixel *src ) ;----------------------------------------------------------------------------- cglobal predict_4x4_vr, 1,1 %if HIGH_BIT_DEPTH movu m1, [r0-1*FDEC_STRIDEB-8] pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0 %else ; !HIGH_BIT_DEPTH movd m0, [r0+2*FDEC_STRIDEB-4] movd m1, [r0+0*FDEC_STRIDEB-4] punpcklbw m0, [r0+1*FDEC_STRIDEB-4] punpcklbw m1, 
[r0-1*FDEC_STRIDEB-4] punpckhwd m0, m1 movd m1, [r0-1*FDEC_STRIDEB] %if cpuflag(ssse3) palignr m1, m0, 4 %else psllq m1, 32 punpckhdq m0, m1 SWAP 1, 0 %endif %endif ; !HIGH_BIT_DEPTH PSRLPIX m2, m1, 1 PSRLPIX m0, m1, 2 pavg%1 m4, m1, m2 PSRLPIX m4, m4, 3 PRED8x8_LOWPASS m2, m0, m1, m2, m3 PSLLPIX m0, m2, 6 PSRLPIX m2, m2, 2 movh [r0+0*FDEC_STRIDEB], m4 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3 movh [r0+1*FDEC_STRIDEB], m2 PSLLPIX m0, m0, 1 movh [r0+2*FDEC_STRIDEB], m4 PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0 movh [r0+3*FDEC_STRIDEB], m2 RET ;----------------------------------------------------------------------------- ; void predict_4x4_hd( pixel *src ) ;----------------------------------------------------------------------------- cglobal predict_4x4_hd, 1,1 %if HIGH_BIT_DEPTH movu m1, [r0-1*FDEC_STRIDEB-8] PSLLPIX m1, m1, 1 pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3 pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2 pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1 pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0 %else movd m0, [r0-1*FDEC_STRIDEB-4] ; lt .. punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. .. PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. .. movd m1, [r0+3*FDEC_STRIDEB-4] ; l3 punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3 movd m2, [r0+1*FDEC_STRIDEB-4] ; l1 punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1 punpckh%3 m1, m2 ; l0 l1 l2 l3 punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 %endif PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2 PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1 pavg%1 m5, m1, m2 PRED8x8_LOWPASS m3, m1, m0, m2, m4 punpckl%2 m5, m3 PSRLPIX m3, m3, 4 PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4 %assign Y 3 movh [r0+Y*FDEC_STRIDEB], m5 %rep 2 %assign Y (Y-1) PSRLPIX m5, m5, 2 movh [r0+Y*FDEC_STRIDEB], m5 %endrep movh [r0+0*FDEC_STRIDEB], m3 RET %endmacro ; PREDICT_4x4 ;----------------------------------------------------------------------------- ; void predict_4x4_ddr( pixel *src ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_MMX mmx2 cglobal predict_4x4_ddr, 1,1 mova m0, [r0+1*FDEC_STRIDEB-8] punpckhwd m0, [r0+0*FDEC_STRIDEB-8] mova m3, [r0+3*FDEC_STRIDEB-8] punpckhwd m3, [r0+2*FDEC_STRIDEB-8] punpckhdq m3, m0 pshufw m0, m3, q3321 pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3 pshufw m1, m0, q3321 PRED8x8_LOWPASS m0, m1, m3, m0 movq [r0+3*FDEC_STRIDEB], m0 movq m2, [r0-1*FDEC_STRIDEB-0] pshufw m4, m2, q2100 pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0 movq m1, m4 PALIGNR m4, m3, 6, m3 PRED8x8_LOWPASS m1, m4, m2, m1 movq [r0+0*FDEC_STRIDEB], m1 pshufw m2, m0, q3321 punpckldq m2, m1 psllq m0, 16 PALIGNR m1, m0, 6, m0 movq [r0+1*FDEC_STRIDEB], m1 movq [r0+2*FDEC_STRIDEB], m2 movd [r0+3*FDEC_STRIDEB+4], m1 RET ;----------------------------------------------------------------------------- ; void predict_4x4_hd( pixel *src ) ;----------------------------------------------------------------------------- cglobal predict_4x4_hd, 1,1 mova m0, [r0+1*FDEC_STRIDEB-8] punpckhwd m0, [r0+0*FDEC_STRIDEB-8] mova m1, [r0+3*FDEC_STRIDEB-8] punpckhwd m1, [r0+2*FDEC_STRIDEB-8] punpckhdq m1, m0 mova m0, m1 movu m3, [r0-1*FDEC_STRIDEB-2] pshufw m4, m1, q0032 mova m7, m3 punpckldq m4, m3 PALIGNR m3, m1, 2, m2 PRED8x8_LOWPASS m2, m4, m1, m3 pavgw m0, m3 punpcklwd m5, m0, m2 punpckhwd m4, m0, m2 mova [r0+3*FDEC_STRIDEB], m5 mova [r0+1*FDEC_STRIDEB], m4 psrlq m5, 32 punpckldq m5, m4 mova [r0+2*FDEC_STRIDEB], m5 pshufw m4, m7, q2100 mova m6, [r0-1*FDEC_STRIDEB+0] pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0 PRED8x8_LOWPASS m3, m4, m6, m7 PALIGNR m3, m0, 6, m0 mova [r0+0*FDEC_STRIDEB], m3 RET INIT_XMM sse2 PREDICT_4x4 w, wd, dq, qdq INIT_XMM 
ssse3 PREDICT_4x4 w, wd, dq, qdq INIT_XMM avx PREDICT_4x4 w, wd, dq, qdq %else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 PREDICT_4x4 b, bw, wd, dq INIT_MMX ssse3 %define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3 PREDICT_4x4 b, bw, wd, dq %endif ;----------------------------------------------------------------------------- ; void predict_4x4_hu( pixel *src ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_MMX cglobal predict_4x4_hu_mmx2, 1,1 movq m0, [r0+0*FDEC_STRIDEB-8] punpckhwd m0, [r0+1*FDEC_STRIDEB-8] movq m1, [r0+2*FDEC_STRIDEB-8] punpckhwd m1, [r0+3*FDEC_STRIDEB-8] punpckhdq m0, m1 pshufw m1, m1, q3333 movq [r0+3*FDEC_STRIDEB], m1 pshufw m3, m0, q3321 pshufw m4, m0, q3332 pavgw m2, m0, m3 PRED8x8_LOWPASS m3, m0, m4, m3 punpcklwd m4, m2, m3 mova [r0+0*FDEC_STRIDEB], m4 psrlq m2, 16 psrlq m3, 16 punpcklwd m2, m3 mova [r0+1*FDEC_STRIDEB], m2 punpckhdq m2, m1 mova [r0+2*FDEC_STRIDEB], m2 RET %else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_4x4_hu_mmx2, 1,1 movd m1, [r0+0*FDEC_STRIDEB-4] punpcklbw m1, [r0+1*FDEC_STRIDEB-4] movd m0, [r0+2*FDEC_STRIDEB-4] punpcklbw m0, [r0+3*FDEC_STRIDEB-4] punpckhwd m1, m0 movq m0, m1 punpckhbw m1, m1 pshufw m1, m1, q3333 punpckhdq m0, m1 movq m2, m0 movq m3, m0 movq m5, m0 psrlq m3, 8 psrlq m2, 16 pavgb m5, m3 PRED8x8_LOWPASS m3, m0, m2, m3, m4 movd [r0+3*FDEC_STRIDEB], m1 punpcklbw m5, m3 movd [r0+0*FDEC_STRIDEB], m5 psrlq m5, 16 movd [r0+1*FDEC_STRIDEB], m5 psrlq m5, 16 movd [r0+2*FDEC_STRIDEB], m5 RET %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_4x4_vl( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_4x4_V1 1 cglobal predict_4x4_vl, 1,1 movu m1, [r0-FDEC_STRIDEB] PSRLPIX m3, m1, 1 PSRLPIX m2, m1, 2 pavg%1 m4, m3, m1 PRED8x8_LOWPASS m0, m1, m2, m3, m5 movh [r0+0*FDEC_STRIDEB], m4 movh [r0+1*FDEC_STRIDEB], m0 PSRLPIX m4, m4, 1 PSRLPIX m0, m0, 1 movh [r0+2*FDEC_STRIDEB], m4 movh [r0+3*FDEC_STRIDEB], m0 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_4x4_V1 w INIT_XMM avx PREDICT_4x4_V1 w INIT_MMX mmx2 cglobal predict_4x4_vl, 1,4 mova m1, [r0-FDEC_STRIDEB+0] mova m2, [r0-FDEC_STRIDEB+8] mova m0, m2 PALIGNR m2, m1, 4, m4 PALIGNR m0, m1, 2, m4 mova m3, m0 pavgw m3, m1 mova [r0+0*FDEC_STRIDEB], m3 psrlq m3, 16 mova [r0+2*FDEC_STRIDEB], m3 PRED8x8_LOWPASS m0, m1, m2, m0 mova [r0+1*FDEC_STRIDEB], m0 psrlq m0, 16 mova [r0+3*FDEC_STRIDEB], m0 movzx r1d, word [r0-FDEC_STRIDEB+ 8] movzx r2d, word [r0-FDEC_STRIDEB+10] movzx r3d, word [r0-FDEC_STRIDEB+12] lea r1d, [r1+r2+1] add r3d, r2d lea r3d, [r3+r1+1] shr r1d, 1 shr r3d, 2 mov [r0+2*FDEC_STRIDEB+6], r1w mov [r0+3*FDEC_STRIDEB+6], r3w RET %else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 PREDICT_4x4_V1 b %endif ;----------------------------------------------------------------------------- ; void predict_4x4_dc( pixel *src ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 %if HIGH_BIT_DEPTH cglobal predict_4x4_dc, 1,1 mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL] paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL] paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL] paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL] psrlq m2, 48 mova m0, [r0-FDEC_STRIDEB] HADDW m0, m1 paddw m0, [pw_4] paddw m0, m2 psrlw m0, 3 SPLATW m0, m0 mova [r0+0*FDEC_STRIDEB], m0 mova [r0+1*FDEC_STRIDEB], m0 mova [r0+2*FDEC_STRIDEB], m0 mova [r0+3*FDEC_STRIDEB], m0 RET %else ; !HIGH_BIT_DEPTH cglobal predict_4x4_dc, 1,4 pxor mm7, mm7 movd mm0, 
[r0-FDEC_STRIDEB] psadbw mm0, mm7 movd r3d, mm0 movzx r1d, byte [r0-1] %assign Y 1 %rep 3 movzx r2d, byte [r0+FDEC_STRIDEB*Y-1] add r1d, r2d %assign Y Y+1 %endrep lea r1d, [r1+r3+4] shr r1d, 3 imul r1d, 0x01010101 mov [r0+FDEC_STRIDEB*0], r1d mov [r0+FDEC_STRIDEB*1], r1d mov [r0+FDEC_STRIDEB*2], r1d mov [r0+FDEC_STRIDEB*3], r1d RET %endif ; HIGH_BIT_DEPTH %macro PREDICT_FILTER 4 ;----------------------------------------------------------------------------- ;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters ) ;----------------------------------------------------------------------------- cglobal predict_8x8_filter, 4,6,6 add r0, 0x58*SIZEOF_PIXEL %define src r0-0x58*SIZEOF_PIXEL %if ARCH_X86_64 == 0 mov r4, r1 %define t1 r4 %define t4 r1 %else %define t1 r1 %define t4 r4 %endif test r3b, 1 je .check_top mov t4d, r2d and t4d, 8 neg t4 mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL] punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)] mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL] punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL] punpckh%2%3 m1, m0 mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL] punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL] mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL] punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL] punpckh%2%3 m3, m2 punpckh%3%4 m3, m1 mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL] mova m1, [src-1*FDEC_STRIDEB] PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0 PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2 PRED8x8_LOWPASS m3, m1, m4, m3, m5 mova [t1+8*SIZEOF_PIXEL], m3 movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL] movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL] lea t4d, [t4*3+2] add t4d, r5d shr t4d, 2 mov [t1+7*SIZEOF_PIXEL], t4%1 mov [t1+6*SIZEOF_PIXEL], t4%1 test r3b, 2 je .done .check_top: %if SIZEOF_PIXEL==1 && cpuflag(ssse3) INIT_XMM cpuname movu m3, [src-1*FDEC_STRIDEB] movhps m0, [src-1*FDEC_STRIDEB-8] test r2b, 8 je .fix_lt_2 .do_top: and r2d, 4 %if ARCH_X86_64 lea r3, [shuf_fixtr] pshufb m3, [r3+r2*4] %else pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? 
shuf_nop : shuf_fixtr %endif psrldq m1, m3, 15 PALIGNR m2, m3, m0, 15, m0 PALIGNR m1, m3, 1, m5 PRED8x8_LOWPASS m0, m2, m1, m3, m5 mova [t1+16*SIZEOF_PIXEL], m0 psrldq m0, 15 movd [t1+32*SIZEOF_PIXEL], m0 .done: REP_RET .fix_lt_2: pslldq m0, m3, 15 jmp .do_top %else mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL] mova m3, [src-1*FDEC_STRIDEB] mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL] test r2b, 8 je .fix_lt_2 test r2b, 4 je .fix_tr_1 .do_top: PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0 PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5 PRED8x8_LOWPASS m4, m2, m0, m3, m5 mova [t1+16*SIZEOF_PIXEL], m4 test r3b, 4 je .done PSRLPIX m5, m1, 7 PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3 PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4 PRED8x8_LOWPASS m0, m2, m5, m1, m4 mova [t1+24*SIZEOF_PIXEL], m0 PSRLPIX m0, m0, 7 movd [t1+32*SIZEOF_PIXEL], m0 .done: REP_RET .fix_lt_2: PSLLPIX m0, m3, 7 test r2b, 4 jne .do_top .fix_tr_1: punpckh%1%2 m1, m3, m3 pshuf%2 m1, m1, q3333 jmp .do_top %endif %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_FILTER w, d, q, dq INIT_XMM ssse3 PREDICT_FILTER w, d, q, dq INIT_XMM avx PREDICT_FILTER w, d, q, dq %else INIT_MMX mmx2 PREDICT_FILTER b, w, d, q INIT_MMX ssse3 PREDICT_FILTER b, w, d, q %endif ;----------------------------------------------------------------------------- ; void predict_8x8_v( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_V 0 cglobal predict_8x8_v, 2,2 mova m0, [r1+16*SIZEOF_PIXEL] STORE8 m0 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse PREDICT_8x8_V %else INIT_MMX mmx2 PREDICT_8x8_V %endif ;----------------------------------------------------------------------------- ; void predict_8x8_h( pixel *src, pixel edge[36] ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_H 2 cglobal predict_8x8_h, 2,2 movu m1, [r1+7*SIZEOF_PIXEL] add r0, 4*FDEC_STRIDEB punpckl%1 m2, m1, m1 punpckh%1 m1, m1 %assign Y 0 %rep 8 %assign i 1+Y/4 SPLAT%2 m0, m %+ i, (3-Y)&3 mova [r0+(Y-4)*FDEC_STRIDEB], m0 %assign Y Y+1 %endrep RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_H wd, D %else INIT_MMX mmx2 PREDICT_8x8_H bw, W %endif ;----------------------------------------------------------------------------- ; void predict_8x8_dc( pixel *src, pixel *edge ); ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH INIT_XMM sse2 cglobal predict_8x8_dc, 2,2 movu m0, [r1+14] paddw m0, [r1+32] HADDW m0, m1 paddw m0, [pw_8] psrlw m0, 4 SPLATW m0, m0 STORE8 m0 RET %else ; !HIGH_BIT_DEPTH INIT_MMX mmx2 cglobal predict_8x8_dc, 2,2 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1+7] psadbw mm1, [r1+16] paddw mm0, [pw_8] paddw mm0, mm1 psrlw mm0, 4 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8 mm0 RET %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_8x8_dc_top ( pixel *src, pixel *edge ); ; void predict_8x8_dc_left( pixel *src, pixel *edge ); ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH %macro PREDICT_8x8_DC 3 cglobal %1, 2,2 %3 m0, [r1+%2] HADDW m0, m1 paddw m0, [pw_4] psrlw m0, 3 SPLATW m0, m0 STORE8 m0 RET %endmacro INIT_XMM sse2 PREDICT_8x8_DC predict_8x8_dc_top , 32, mova PREDICT_8x8_DC predict_8x8_dc_left, 14, movu %else ; !HIGH_BIT_DEPTH %macro PREDICT_8x8_DC 2 cglobal %1, 2,2 pxor mm0, mm0 psadbw mm0, [r1+%2] paddw mm0, [pw_4] psrlw mm0, 3 pshufw mm0, mm0, 0 packuswb mm0, mm0 STORE8 mm0 RET %endmacro INIT_MMX PREDICT_8x8_DC 
predict_8x8_dc_top_mmx2, 16 PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7 %endif ; HIGH_BIT_DEPTH ; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe ; size on the 8-bit mmx functions below if we know sse2 is available. %macro PREDICT_8x8_DDLR 0 ;----------------------------------------------------------------------------- ; void predict_8x8_ddl( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddl, 2,2,7 mova m0, [r1+16*SIZEOF_PIXEL] mova m1, [r1+24*SIZEOF_PIXEL] %if cpuflag(cache64) movd m5, [r1+32*SIZEOF_PIXEL] palignr m3, m1, m0, 1*SIZEOF_PIXEL palignr m5, m5, m1, 1*SIZEOF_PIXEL palignr m4, m1, m0, 7*SIZEOF_PIXEL %else movu m3, [r1+17*SIZEOF_PIXEL] movu m4, [r1+23*SIZEOF_PIXEL] movu m5, [r1+25*SIZEOF_PIXEL] %endif PSLLPIX m2, m0, 1 add r0, FDEC_STRIDEB*4 PRED8x8_LOWPASS m0, m2, m3, m0, m6 PRED8x8_LOWPASS m1, m4, m5, m1, m6 mova [r0+3*FDEC_STRIDEB], m1 %assign Y 2 %rep 6 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2 PSLLPIX m0, m0, 1 mova [r0+Y*FDEC_STRIDEB], m1 %assign Y (Y-1) %endrep PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 mova [r0+Y*FDEC_STRIDEB], m1 RET ;----------------------------------------------------------------------------- ; void predict_8x8_ddr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddr, 2,2,7 add r0, FDEC_STRIDEB*4 mova m0, [r1+ 8*SIZEOF_PIXEL] mova m1, [r1+16*SIZEOF_PIXEL] ; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit movu m2, [r1+ 7*SIZEOF_PIXEL] movu m5, [r1+17*SIZEOF_PIXEL] %if cpuflag(cache64) palignr m3, m1, m0, 1*SIZEOF_PIXEL palignr m4, m1, m0, 7*SIZEOF_PIXEL %else movu m3, [r1+ 9*SIZEOF_PIXEL] movu m4, [r1+15*SIZEOF_PIXEL] %endif PRED8x8_LOWPASS m0, m2, m3, m0, m6 PRED8x8_LOWPASS m1, m4, m5, m1, m6 mova [r0+3*FDEC_STRIDEB], m0 %assign Y -4 %rep 6 PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2 PSLLPIX m0, m0, 1 mova [r0+Y*FDEC_STRIDEB], m1 %assign Y (Y+1) %endrep PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0 mova [r0+Y*FDEC_STRIDEB], m1 RET %endmacro ; PREDICT_8x8_DDLR %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_DDLR INIT_XMM ssse3 PREDICT_8x8_DDLR INIT_XMM cache64, ssse3 PREDICT_8x8_DDLR %elif ARCH_X86_64 == 0 INIT_MMX mmx2 PREDICT_8x8_DDLR %endif ;----------------------------------------------------------------------------- ; void predict_8x8_hu( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HU 2 cglobal predict_8x8_hu, 2,2,8 add r0, 4*FDEC_STRIDEB %if HIGH_BIT_DEPTH %if cpuflag(ssse3) movu m5, [r1+7*SIZEOF_PIXEL] pshufb m5, [pw_reverse] %else movq m6, [r1+7*SIZEOF_PIXEL] movq m5, [r1+11*SIZEOF_PIXEL] pshuflw m6, m6, q0123 pshuflw m5, m5, q0123 movlhps m5, m6 %endif ; cpuflag psrldq m2, m5, 2 pshufd m3, m5, q0321 pshufhw m2, m2, q2210 pshufhw m3, m3, q1110 pavgw m4, m5, m2 %else ; !HIGH_BIT_DEPTH movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 psllq m1, 56 ; l7 .. .. .. .. .. .. .. 
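; this 8-bit path is only instantiated for mmx2, which has no pshufb, so the
; byte-wise reversal of the left column is finished below with word shifts and
; por; predict_8x8_hu_ssse3 further down does the same reversal with a single
; pshufb against shuf_hu.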
mova m2, m0 psllw m0, 8 psrlw m2, 8 por m2, m0 mova m3, m2 mova m4, m2 mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0 psrlq m3, 16 psrlq m2, 8 por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1 punpckhbw m1, m1 por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2 pavgb m4, m2 %endif ; !HIGH_BIT_DEPTH PRED8x8_LOWPASS m2, m3, m5, m2, m6 punpckh%2 m0, m4, m2 ; p8 p7 p6 p5 punpckl%2 m4, m2 ; p4 p3 p2 p1 PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3 pshuf%1 m1, m0, q3321 PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3 pshuf%1 m2, m0, q3332 PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3 pshuf%1 m3, m0, q3333 mova [r0-4*FDEC_STRIDEB], m4 mova [r0-3*FDEC_STRIDEB], m5 mova [r0-2*FDEC_STRIDEB], m6 mova [r0-1*FDEC_STRIDEB], m7 mova [r0+0*FDEC_STRIDEB], m0 mova [r0+1*FDEC_STRIDEB], m1 mova [r0+2*FDEC_STRIDEB], m2 mova [r0+3*FDEC_STRIDEB], m3 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_HU d, wd INIT_XMM ssse3 PREDICT_8x8_HU d, wd INIT_XMM avx PREDICT_8x8_HU d, wd %elif ARCH_X86_64 == 0 INIT_MMX mmx2 PREDICT_8x8_HU w, bw %endif ;----------------------------------------------------------------------------- ; void predict_8x8_vr( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_VR 1 cglobal predict_8x8_vr, 2,3 mova m2, [r1+16*SIZEOF_PIXEL] %ifidn cpuname, ssse3 mova m0, [r1+8*SIZEOF_PIXEL] palignr m3, m2, m0, 7*SIZEOF_PIXEL palignr m1, m2, m0, 6*SIZEOF_PIXEL %else movu m3, [r1+15*SIZEOF_PIXEL] movu m1, [r1+14*SIZEOF_PIXEL] %endif pavg%1 m4, m3, m2 add r0, FDEC_STRIDEB*4 PRED8x8_LOWPASS m3, m1, m2, m3, m5 mova [r0-4*FDEC_STRIDEB], m4 mova [r0-3*FDEC_STRIDEB], m3 mova m1, [r1+8*SIZEOF_PIXEL] PSLLPIX m0, m1, 1 PSLLPIX m2, m1, 2 PRED8x8_LOWPASS m0, m1, m2, m0, m6 %assign Y -2 %rep 5 PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5 mova [r0+Y*FDEC_STRIDEB], m4 PSLLPIX m0, m0, 1 SWAP 3, 4 %assign Y (Y+1) %endrep PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0 mova [r0+Y*FDEC_STRIDEB], m4 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_VR w INIT_XMM ssse3 PREDICT_8x8_VR w INIT_XMM avx PREDICT_8x8_VR w %elif ARCH_X86_64 == 0 INIT_MMX mmx2 PREDICT_8x8_VR b %endif %macro LOAD_PLANE_ARGS 0 %if cpuflag(avx2) && ARCH_X86_64 == 0 vpbroadcastw m0, r1m vpbroadcastw m2, r2m vpbroadcastw m4, r3m %elif mmsize == 8 ; MMX is only used on x86_32 SPLATW m0, r1m SPLATW m2, r2m SPLATW m4, r3m %else movd xm0, r1m movd xm2, r2m movd xm4, r3m SPLATW m0, xm0 SPLATW m2, xm2 SPLATW m4, xm4 %endif %endmacro ;----------------------------------------------------------------------------- ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0 %macro PREDICT_CHROMA_P_MMX 1 cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS movq m1, m2 pmullw m2, [pw_0to15] psllw m1, 2 paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b} paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b} mov r1d, %1 ALIGN 4 .loop: movq m5, m0 movq m6, m1 psraw m5, 5 psraw m6, 5 packuswb m5, m6 movq [r0], m5 paddsw m0, m4 paddsw m1, m4 add r0, FDEC_STRIDE dec r1d jg .loop RET %endmacro ; PREDICT_CHROMA_P_MMX INIT_MMX mmx2 PREDICT_CHROMA_P_MMX 8 PREDICT_CHROMA_P_MMX 16 %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH %macro PREDICT_CHROMA_P 1 %if HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2,7 LOAD_PLANE_ARGS mova m3, [pw_pixel_max] pxor m1, m1 pmullw m2, [pw_43210123] ; b %if %1 == 16 pmullw m5, m4, [pw_m7] ; c %else pmullw m5, m4, [pw_m3] %endif paddw m5, [pw_16] %if mmsize == 32 mova xm6, xm4 paddw m4, m4 paddw m5, m6 %endif mov r1d, 
%1/(mmsize/16) .loop: paddsw m6, m2, m5 paddsw m6, m0 psraw m6, 5 CLIPW m6, m1, m3 paddw m5, m4 %if mmsize == 32 vextracti128 [r0], m6, 1 mova [r0+FDEC_STRIDEB], xm6 add r0, 2*FDEC_STRIDEB %else mova [r0], m6 add r0, FDEC_STRIDEB %endif dec r1d jg .loop RET %else ; !HIGH_BIT_DEPTH cglobal predict_8x%1c_p_core, 1,2 LOAD_PLANE_ARGS %if mmsize == 32 vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 pmullw m2, m1 mova xm1, xm4 ; zero upper half paddsw m4, m4 paddsw m0, m1 %else pmullw m2, [pw_0to15] %endif paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b} paddsw m1, m0, m4 paddsw m4, m4 mov r1d, %1/(mmsize/8) .loop: psraw m2, m0, 5 psraw m3, m1, 5 paddsw m0, m4 paddsw m1, m4 packuswb m2, m3 %if mmsize == 32 movq [r0+FDEC_STRIDE*1], xm2 movhps [r0+FDEC_STRIDE*3], xm2 vextracti128 xm2, m2, 1 movq [r0+FDEC_STRIDE*0], xm2 movhps [r0+FDEC_STRIDE*2], xm2 %else movq [r0+FDEC_STRIDE*0], xm2 movhps [r0+FDEC_STRIDE*1], xm2 %endif add r0, FDEC_STRIDE*mmsize/8 dec r1d jg .loop RET %endif ; HIGH_BIT_DEPTH %endmacro ; PREDICT_CHROMA_P INIT_XMM sse2 PREDICT_CHROMA_P 8 PREDICT_CHROMA_P 16 INIT_XMM avx PREDICT_CHROMA_P 8 PREDICT_CHROMA_P 16 INIT_YMM avx2 PREDICT_CHROMA_P 8 PREDICT_CHROMA_P 16 ;----------------------------------------------------------------------------- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c ) ;----------------------------------------------------------------------------- %if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0 INIT_MMX mmx2 cglobal predict_16x16_p_core, 1,2 LOAD_PLANE_ARGS movq mm5, mm2 movq mm1, mm2 pmullw mm5, [pw_0to15] psllw mm2, 3 psllw mm1, 2 movq mm3, mm2 paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b} paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b} paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b} mov r1d, 16 ALIGN 4 .loop: movq mm5, mm0 movq mm6, mm1 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 movq [r0], mm5 movq mm5, mm2 movq mm6, mm3 psraw mm5, 5 psraw mm6, 5 packuswb mm5, mm6 movq [r0+8], mm5 paddsw mm0, mm4 paddsw mm1, mm4 paddsw mm2, mm4 paddsw mm3, mm4 add r0, FDEC_STRIDE dec r1d jg .loop RET %endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64 %macro PREDICT_16x16_P 0 cglobal predict_16x16_p_core, 1,2,8 movd m0, r1m movd m1, r2m movd m2, r3m SPLATW m0, m0, 0 SPLATW m1, m1, 0 SPLATW m2, m2, 0 pmullw m3, m1, [pw_0to15] psllw m1, 3 %if HIGH_BIT_DEPTH pxor m6, m6 mov r1d, 16 .loop: mova m4, m0 mova m5, m0 mova m7, m3 paddsw m7, m6 paddsw m4, m7 paddsw m7, m1 paddsw m5, m7 psraw m4, 5 psraw m5, 5 CLIPW m4, [pb_0], [pw_pixel_max] CLIPW m5, [pb_0], [pw_pixel_max] mova [r0], m4 mova [r0+16], m5 add r0, FDEC_STRIDEB paddw m6, m2 %else ; !HIGH_BIT_DEPTH paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b} paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b} paddsw m7, m2, m2 mov r1d, 8 ALIGN 4 .loop: psraw m3, m0, 5 psraw m4, m1, 5 paddsw m5, m0, m2 paddsw m6, m1, m2 psraw m5, 5 psraw m6, 5 packuswb m3, m4 packuswb m5, m6 mova [r0+FDEC_STRIDE*0], m3 mova [r0+FDEC_STRIDE*1], m5 paddsw m0, m7 paddsw m1, m7 add r0, FDEC_STRIDE*2 %endif ; !HIGH_BIT_DEPTH dec r1d jg .loop RET %endmacro ; PREDICT_16x16_P INIT_XMM sse2 PREDICT_16x16_P %if HIGH_BIT_DEPTH == 0 INIT_XMM avx PREDICT_16x16_P %endif INIT_YMM avx2 cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH LOAD_PLANE_ARGS %if HIGH_BIT_DEPTH pmullw m2, [pw_0to15] pxor m5, m5 pxor m6, m6 mova m7, [pw_pixel_max] mov r1d, 8 .loop: paddsw m1, 
m2, m5 paddw m5, m4 paddsw m1, m0 paddsw m3, m2, m5 psraw m1, 5 paddsw m3, m0 psraw m3, 5 CLIPW m1, m6, m7 mova [r0+0*FDEC_STRIDEB], m1 CLIPW m3, m6, m7 mova [r0+1*FDEC_STRIDEB], m3 paddw m5, m4 add r0, 2*FDEC_STRIDEB %else ; !HIGH_BIT_DEPTH vbroadcasti128 m1, [pw_0to15] mova xm3, xm4 ; zero high bits pmullw m1, m2 psllw m2, 3 paddsw m0, m3 paddsw m0, m1 ; X+1*C X+0*C paddsw m1, m0, m2 ; Y+1*C Y+0*C paddsw m4, m4 mov r1d, 4 .loop: psraw m2, m0, 5 psraw m3, m1, 5 paddsw m0, m4 paddsw m1, m4 packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C vextracti128 [r0+0*FDEC_STRIDE], m2, 1 mova [r0+1*FDEC_STRIDE], xm2 psraw m2, m0, 5 psraw m3, m1, 5 paddsw m0, m4 paddsw m1, m4 packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C vextracti128 [r0+2*FDEC_STRIDE], m2, 1 mova [r0+3*FDEC_STRIDE], xm2 add r0, FDEC_STRIDE*4 %endif ; !HIGH_BIT_DEPTH dec r1d jg .loop RET %if HIGH_BIT_DEPTH == 0 %macro PREDICT_8x8 0 ;----------------------------------------------------------------------------- ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddl, 2,2 mova m0, [r1+16] %ifidn cpuname, ssse3 movd m2, [r1+32] palignr m2, m0, 1 %else movu m2, [r1+17] %endif pslldq m1, m0, 1 add r0, FDEC_STRIDE*4 PRED8x8_LOWPASS m0, m1, m2, m0, m3 %assign Y -4 %rep 8 psrldq m0, 1 movq [r0+Y*FDEC_STRIDE], m0 %assign Y (Y+1) %endrep RET %ifnidn cpuname, ssse3 ;----------------------------------------------------------------------------- ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_ddr, 2,2 movu m0, [r1+8] movu m1, [r1+7] psrldq m2, m0, 1 add r0, FDEC_STRIDE*4 PRED8x8_LOWPASS m0, m1, m2, m0, m3 psrldq m1, m0, 1 %assign Y 3 %rep 3 movq [r0+Y*FDEC_STRIDE], m0 movq [r0+(Y-1)*FDEC_STRIDE], m1 psrldq m0, 2 psrldq m1, 2 %assign Y (Y-2) %endrep movq [r0-3*FDEC_STRIDE], m0 movq [r0-4*FDEC_STRIDE], m1 RET ;----------------------------------------------------------------------------- ; void predict_8x8_vl( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_vl, 2,2 mova m0, [r1+16] pslldq m1, m0, 1 psrldq m2, m0, 1 pavgb m3, m0, m2 add r0, FDEC_STRIDE*4 PRED8x8_LOWPASS m0, m1, m2, m0, m5 ; m0: (t0 + 2*t1 + t2 + 2) >> 2 ; m3: (t0 + t1 + 1) >> 1 %assign Y -4 %rep 3 psrldq m0, 1 movq [r0+ Y *FDEC_STRIDE], m3 movq [r0+(Y+1)*FDEC_STRIDE], m0 psrldq m3, 1 %assign Y (Y+2) %endrep psrldq m0, 1 movq [r0+ Y *FDEC_STRIDE], m3 movq [r0+(Y+1)*FDEC_STRIDE], m0 RET %endif ; !ssse3 ;----------------------------------------------------------------------------- ; void predict_8x8_vr( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- cglobal predict_8x8_vr, 2,2 movu m2, [r1+8] add r0, 4*FDEC_STRIDE pslldq m1, m2, 2 pslldq m0, m2, 1 pavgb m3, m2, m0 PRED8x8_LOWPASS m0, m2, m1, m0, m4 movhps [r0-4*FDEC_STRIDE], m3 movhps [r0-3*FDEC_STRIDE], m0 %if cpuflag(ssse3) punpckhqdq m3, m3 pshufb m0, [shuf_vr] palignr m3, m0, 13 %else mova m2, m0 mova m1, [pw_00ff] pand m1, m0 psrlw m0, 8 packuswb m1, m0 pslldq m1, 4 movhlps m3, m1 shufps m1, m2, q3210 psrldq m3, 5 psrldq m1, 5 SWAP 0, 1 %endif movq [r0+3*FDEC_STRIDE], m0 movq [r0+2*FDEC_STRIDE], m3 psrldq m0, 1 psrldq m3, 1 movq [r0+1*FDEC_STRIDE], m0 movq [r0+0*FDEC_STRIDE], m3 psrldq m0, 1 psrldq m3, 1 movq [r0-1*FDEC_STRIDE], m0 movq [r0-2*FDEC_STRIDE], m3 RET %endmacro ; PREDICT_8x8 INIT_XMM sse2 
PREDICT_8x8 INIT_XMM ssse3 PREDICT_8x8 INIT_XMM avx PREDICT_8x8 %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_8x8_vl( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_VL_10 1 cglobal predict_8x8_vl, 2,2,8 mova m0, [r1+16*SIZEOF_PIXEL] mova m1, [r1+24*SIZEOF_PIXEL] PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4 PSRLPIX m4, m1, 1 pavg%1 m6, m0, m2 pavg%1 m7, m1, m4 add r0, FDEC_STRIDEB*4 mova [r0-4*FDEC_STRIDEB], m6 PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5 mova [r0-2*FDEC_STRIDEB], m3 PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5 mova [r0+0*FDEC_STRIDEB], m3 PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5 mova [r0+2*FDEC_STRIDEB], m7 PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6 PSLLPIX m5, m0, 1 PRED8x8_LOWPASS m0, m5, m2, m0, m7 PRED8x8_LOWPASS m1, m3, m4, m1, m7 PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2 mova [r0-3*FDEC_STRIDEB], m4 PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2 mova [r0-1*FDEC_STRIDEB], m4 PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2 mova [r0+1*FDEC_STRIDEB], m4 PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2 mova [r0+3*FDEC_STRIDEB], m1 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_VL_10 w INIT_XMM ssse3 PREDICT_8x8_VL_10 w INIT_XMM avx PREDICT_8x8_VL_10 w %else INIT_MMX mmx2 PREDICT_8x8_VL_10 b %endif ;----------------------------------------------------------------------------- ; void predict_8x8_hd( pixel *src, pixel *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HD 2 cglobal predict_8x8_hd, 2,2 add r0, 4*FDEC_STRIDEB mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6 movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7 %ifidn cpuname, ssse3 mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0 mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0 palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5 %else movu m2, [r1+15*SIZEOF_PIXEL] movu m4, [r1+ 9*SIZEOF_PIXEL] %endif ; cpuflag pavg%1 m3, m0, m1 PRED8x8_LOWPASS m0, m4, m1, m0, m5 PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1 PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0 PRED8x8_LOWPASS m1, m4, m2, m1, m5 ; .. 
p11 p10 p9 punpckh%2 m2, m3, m0 ; p8 p7 p6 p5 punpckl%2 m3, m0 ; p4 p3 p2 p1 mova [r0+3*FDEC_STRIDEB], m3 PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5 mova [r0+2*FDEC_STRIDEB], m0 PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5 mova [r0+1*FDEC_STRIDEB], m0 PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3 mova [r0+0*FDEC_STRIDEB], m0 mova [r0-1*FDEC_STRIDEB], m2 PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5 mova [r0-2*FDEC_STRIDEB], m0 PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5 mova [r0-3*FDEC_STRIDEB], m0 PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2 mova [r0-4*FDEC_STRIDEB], m1 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_8x8_HD w, wd INIT_XMM ssse3 PREDICT_8x8_HD w, wd INIT_XMM avx PREDICT_8x8_HD w, wd %else INIT_MMX mmx2 PREDICT_8x8_HD b, bw ;----------------------------------------------------------------------------- ; void predict_8x8_hd( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8_HD 0 cglobal predict_8x8_hd, 2,2 add r0, 4*FDEC_STRIDE movu m1, [r1+7] movu m3, [r1+8] movu m2, [r1+9] pavgb m4, m1, m3 PRED8x8_LOWPASS m0, m1, m2, m3, m5 punpcklbw m4, m0 movhlps m0, m4 %assign Y 3 %rep 3 movq [r0+(Y)*FDEC_STRIDE], m4 movq [r0+(Y-4)*FDEC_STRIDE], m0 psrldq m4, 2 psrldq m0, 2 %assign Y (Y-1) %endrep movq [r0+(Y)*FDEC_STRIDE], m4 movq [r0+(Y-4)*FDEC_STRIDE], m0 RET %endmacro INIT_XMM sse2 PREDICT_8x8_HD INIT_XMM avx PREDICT_8x8_HD %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- ; void predict_8x8_hu( uint8_t *src, uint8_t *edge ) ;----------------------------------------------------------------------------- INIT_MMX cglobal predict_8x8_hu_sse2, 2,2 add r0, 4*FDEC_STRIDE movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7 pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1 movq mm2, mm0 psllw mm0, 8 psrlw mm2, 8 por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 psllq mm1, 56 ; l7 .. .. .. .. .. .. .. 
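; at this point mm2 holds the left column reversed (l7..l0) and mm1 keeps l7 in
; its top byte; the register copies built below feed the (a+b+1)>>1 average
; (pavgb) and the (a+2*b+c+2)>>2 lowpass (PRED8x8_LOWPASS) used by HU prediction.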
movq mm3, mm2 movq mm4, mm2 movq mm5, mm2 psrlq mm2, 8 psrlq mm3, 16 por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1 punpckhbw mm1, mm1 por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2 pavgb mm4, mm2 PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6 movq2dq xmm0, mm4 movq2dq xmm1, mm1 punpcklbw xmm0, xmm1 punpckhbw mm4, mm1 %assign Y -4 %rep 3 movq [r0+Y*FDEC_STRIDE], xmm0 psrldq xmm0, 2 %assign Y (Y+1) %endrep pshufw mm5, mm4, q3321 pshufw mm6, mm4, q3332 pshufw mm7, mm4, q3333 movq [r0+Y*FDEC_STRIDE], xmm0 movq [r0+0*FDEC_STRIDE], mm4 movq [r0+1*FDEC_STRIDE], mm5 movq [r0+2*FDEC_STRIDE], mm6 movq [r0+3*FDEC_STRIDE], mm7 RET INIT_XMM cglobal predict_8x8_hu_ssse3, 2,2 add r0, 4*FDEC_STRIDE movq m3, [r1+7] pshufb m3, [shuf_hu] psrldq m1, m3, 1 psrldq m2, m3, 2 pavgb m0, m1, m3 PRED8x8_LOWPASS m1, m3, m2, m1, m4 punpcklbw m0, m1 %assign Y -4 %rep 3 movq [r0+ Y *FDEC_STRIDE], m0 movhps [r0+(Y+4)*FDEC_STRIDE], m0 psrldq m0, 2 pshufhw m0, m0, q2210 %assign Y (Y+1) %endrep movq [r0+ Y *FDEC_STRIDE], m0 movhps [r0+(Y+4)*FDEC_STRIDE], m0 RET %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void predict_8x8c_v( uint8_t *src ) ;----------------------------------------------------------------------------- %macro PREDICT_8x8C_V 0 cglobal predict_8x8c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] STORE8 m0 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse PREDICT_8x8C_V %else INIT_MMX mmx PREDICT_8x8C_V %endif %if HIGH_BIT_DEPTH INIT_MMX cglobal predict_8x8c_v_mmx, 1,1 mova m0, [r0 - FDEC_STRIDEB] mova m1, [r0 - FDEC_STRIDEB + 8] %assign Y 0 %rep 8 mova [r0 + (Y&1)*FDEC_STRIDEB], m0 mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1 %if (Y&1) && (Y!=7) add r0, FDEC_STRIDEB*2 %endif %assign Y Y+1 %endrep RET %endif %macro PREDICT_8x16C_V 0 cglobal predict_8x16c_v, 1,1 mova m0, [r0 - FDEC_STRIDEB] STORE16 m0 RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse PREDICT_8x16C_V %else INIT_MMX mmx PREDICT_8x16C_V %endif ;----------------------------------------------------------------------------- ; void predict_8x8c_h( uint8_t *src ) ;----------------------------------------------------------------------------- %macro PREDICT_C_H 0 cglobal predict_8x8c_h, 1,1 %if cpuflag(ssse3) && notcpuflag(avx2) mova m2, [pb_3] %endif PRED_H_4ROWS 8, 1 PRED_H_4ROWS 8, 0 RET cglobal predict_8x16c_h, 1,2 %if cpuflag(ssse3) && notcpuflag(avx2) mova m2, [pb_3] %endif mov r1d, 4 .loop: PRED_H_4ROWS 8, 1 dec r1d jg .loop RET %endmacro INIT_MMX mmx2 PREDICT_C_H %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_C_H INIT_XMM avx2 PREDICT_C_H %else INIT_MMX ssse3 PREDICT_C_H %endif ;----------------------------------------------------------------------------- ; void predict_8x8c_dc( pixel *src ) ;----------------------------------------------------------------------------- %macro LOAD_LEFT 1 movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL] movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL] add r1d, r2d movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL] add r1d, r2d movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL] add r1d, r2d %endmacro %macro PREDICT_8x8C_DC 0 cglobal predict_8x8c_dc, 1,3 pxor m7, m7 %if HIGH_BIT_DEPTH movq m0, [r0-FDEC_STRIDEB+0] movq m1, [r0-FDEC_STRIDEB+8] HADDW m0, m2 HADDW m1, m2 %else ; !HIGH_BIT_DEPTH movd m0, [r0-FDEC_STRIDEB+0] movd m1, [r0-FDEC_STRIDEB+4] psadbw m0, m7 ; s0 psadbw m1, m7 ; s1 %endif add r0, FDEC_STRIDEB*4 LOAD_LEFT 0 ; s2 movd m2, r1d LOAD_LEFT 4 ; s3 movd m3, r1d punpcklwd m0, m1 punpcklwd m2, m3 punpckldq m0, m2 ; s0, s1, s2, s3 pshufw m3, m0, q3312 ; s2, s1, s3, s3 pshufw m0, 
m0, q1310 ; s0, s1, s3, s1 paddw m0, m3 psrlw m0, 2 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 %if HIGH_BIT_DEPTH %if cpuflag(sse2) movq2dq xmm0, m0 punpcklwd xmm0, xmm0 pshufd xmm1, xmm0, q3322 punpckldq xmm0, xmm0 %assign Y 0 %rep 8 %assign i (0 + (Y/4)) movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i %assign Y Y+1 %endrep %else ; !sse2 pshufw m1, m0, q0000 pshufw m2, m0, q1111 pshufw m3, m0, q2222 pshufw m4, m0, q3333 %assign Y 0 %rep 8 %assign i (1 + (Y/4)*2) %assign j (2 + (Y/4)*2) movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j %assign Y Y+1 %endrep %endif %else ; !HIGH_BIT_DEPTH packuswb m0, m0 punpcklbw m0, m0 movq m1, m0 punpcklbw m0, m0 punpckhbw m1, m1 %assign Y 0 %rep 8 %assign i (0 + (Y/4)) movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i %assign Y Y+1 %endrep %endif RET %endmacro INIT_MMX mmx2 PREDICT_8x8C_DC %if HIGH_BIT_DEPTH INIT_MMX sse2 PREDICT_8x8C_DC %endif %if HIGH_BIT_DEPTH %macro STORE_4LINES 3 %if cpuflag(sse2) movdqa [r0+FDEC_STRIDEB*(%3-4)], %1 movdqa [r0+FDEC_STRIDEB*(%3-3)], %1 movdqa [r0+FDEC_STRIDEB*(%3-2)], %1 movdqa [r0+FDEC_STRIDEB*(%3-1)], %1 %else movq [r0+FDEC_STRIDEB*(%3-4)+0], %1 movq [r0+FDEC_STRIDEB*(%3-4)+8], %2 movq [r0+FDEC_STRIDEB*(%3-3)+0], %1 movq [r0+FDEC_STRIDEB*(%3-3)+8], %2 movq [r0+FDEC_STRIDEB*(%3-2)+0], %1 movq [r0+FDEC_STRIDEB*(%3-2)+8], %2 movq [r0+FDEC_STRIDEB*(%3-1)+0], %1 movq [r0+FDEC_STRIDEB*(%3-1)+8], %2 %endif %endmacro %else %macro STORE_4LINES 2 movq [r0+FDEC_STRIDEB*(%2-4)], %1 movq [r0+FDEC_STRIDEB*(%2-3)], %1 movq [r0+FDEC_STRIDEB*(%2-2)], %1 movq [r0+FDEC_STRIDEB*(%2-1)], %1 %endmacro %endif %macro PREDICT_8x16C_DC 0 cglobal predict_8x16c_dc, 1,3 pxor m7, m7 %if HIGH_BIT_DEPTH movq m0, [r0-FDEC_STRIDEB+0] movq m1, [r0-FDEC_STRIDEB+8] HADDW m0, m2 HADDW m1, m2 %else movd m0, [r0-FDEC_STRIDEB+0] movd m1, [r0-FDEC_STRIDEB+4] psadbw m0, m7 ; s0 psadbw m1, m7 ; s1 %endif punpcklwd m0, m1 ; s0, s1 add r0, FDEC_STRIDEB*4 LOAD_LEFT 0 ; s2 pinsrw m0, r1d, 2 LOAD_LEFT 4 ; s3 pinsrw m0, r1d, 3 ; s0, s1, s2, s3 add r0, FDEC_STRIDEB*8 LOAD_LEFT 0 ; s4 pinsrw m1, r1d, 2 LOAD_LEFT 4 ; s5 pinsrw m1, r1d, 3 ; s1, __, s4, s5 sub r0, FDEC_STRIDEB*8 pshufw m2, m0, q1310 ; s0, s1, s3, s1 pshufw m0, m0, q3312 ; s2, s1, s3, s3 pshufw m3, m1, q0302 ; s4, s1, s5, s1 pshufw m1, m1, q3322 ; s4, s4, s5, s5 paddw m0, m2 paddw m1, m3 psrlw m0, 2 psrlw m1, 2 pavgw m0, m7 pavgw m1, m7 %if HIGH_BIT_DEPTH %if cpuflag(sse2) movq2dq xmm0, m0 movq2dq xmm1, m1 punpcklwd xmm0, xmm0 punpcklwd xmm1, xmm1 pshufd xmm2, xmm0, q3322 pshufd xmm3, xmm1, q3322 punpckldq xmm0, xmm0 punpckldq xmm1, xmm1 STORE_4LINES xmm0, xmm0, 0 STORE_4LINES xmm2, xmm2, 4 STORE_4LINES xmm1, xmm1, 8 STORE_4LINES xmm3, xmm3, 12 %else pshufw m2, m0, q0000 pshufw m3, m0, q1111 pshufw m4, m0, q2222 pshufw m5, m0, q3333 STORE_4LINES m2, m3, 0 STORE_4LINES m4, m5, 4 pshufw m2, m1, q0000 pshufw m3, m1, q1111 pshufw m4, m1, q2222 pshufw m5, m1, q3333 STORE_4LINES m2, m3, 8 STORE_4LINES m4, m5, 12 %endif %else packuswb m0, m0 ; dc0, dc1, dc2, dc3 packuswb m1, m1 ; dc4, dc5, dc6, dc7 punpcklbw m0, m0 punpcklbw m1, m1 pshufw m2, m0, q1100 pshufw m3, m0, q3322 pshufw m4, m1, q1100 pshufw m5, m1, q3322 STORE_4LINES m2, 0 STORE_4LINES m3, 4 add r0, FDEC_STRIDEB*8 STORE_4LINES m4, 0 STORE_4LINES m5, 4 %endif RET %endmacro INIT_MMX mmx2 PREDICT_8x16C_DC %if HIGH_BIT_DEPTH INIT_MMX sse2 PREDICT_8x16C_DC %endif %macro PREDICT_C_DC_TOP 1 %if HIGH_BIT_DEPTH INIT_XMM cglobal predict_8x%1c_dc_top_sse2, 1,1 pxor m2, m2 mova m0, [r0 - FDEC_STRIDEB] pshufd m1, m0, q2301 paddw m0, m1 pshuflw m1, m0, q2301 
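; the matching word swap for the high qword follows; after the next paddw every
; lane of each 4-sample half holds that half's horizontal sum, which psrlw+pavgw
; then round down to (sum+2)>>2 per half.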
pshufhw m1, m1, q2301 paddw m0, m1 psrlw m0, 1 pavgw m0, m2 STORE%1 m0 RET %else ; !HIGH_BIT_DEPTH INIT_MMX cglobal predict_8x%1c_dc_top_mmx2, 1,1 movq mm0, [r0 - FDEC_STRIDE] pxor mm1, mm1 pxor mm2, mm2 punpckhbw mm1, mm0 punpcklbw mm0, mm2 psadbw mm1, mm2 ; s1 psadbw mm0, mm2 ; s0 psrlw mm1, 1 psrlw mm0, 1 pavgw mm1, mm2 pavgw mm0, mm2 pshufw mm1, mm1, 0 pshufw mm0, mm0, 0 ; dc0 (w) packuswb mm0, mm1 ; dc0,dc1 (b) STORE%1 mm0 RET %endif %endmacro PREDICT_C_DC_TOP 8 PREDICT_C_DC_TOP 16 ;----------------------------------------------------------------------------- ; void predict_16x16_v( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_16x16_V 0 cglobal predict_16x16_v, 1,2 %assign %%i 0 %rep 16*SIZEOF_PIXEL/mmsize mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize] %assign %%i %%i+1 %endrep %if 16*SIZEOF_PIXEL/mmsize == 4 STORE16 m0, m1, m2, m3 %elif 16*SIZEOF_PIXEL/mmsize == 2 STORE16 m0, m1 %else STORE16 m0 %endif RET %endmacro INIT_MMX mmx2 PREDICT_16x16_V INIT_XMM sse PREDICT_16x16_V %if HIGH_BIT_DEPTH INIT_YMM avx PREDICT_16x16_V %endif ;----------------------------------------------------------------------------- ; void predict_16x16_h( pixel *src ) ;----------------------------------------------------------------------------- %macro PREDICT_16x16_H 0 cglobal predict_16x16_h, 1,2 %if cpuflag(ssse3) && notcpuflag(avx2) mova m2, [pb_3] %endif mov r1d, 4 .loop: PRED_H_4ROWS 16, 1 dec r1d jg .loop RET %endmacro INIT_MMX mmx2 PREDICT_16x16_H %if HIGH_BIT_DEPTH INIT_XMM sse2 PREDICT_16x16_H INIT_YMM avx2 PREDICT_16x16_H %else ;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3 INIT_XMM ssse3 PREDICT_16x16_H %endif ;----------------------------------------------------------------------------- ; void predict_16x16_dc( pixel *src ) ;----------------------------------------------------------------------------- %if WIN64 DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes %else DECLARE_REG_TMP 3 %endif INIT_XMM ; Returns the sum of the left pixels in r1d+r2d cglobal predict_16x16_dc_left_internal, 0,4 movzx r1d, pixel [r0-SIZEOF_PIXEL] movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL] %assign i 2*FDEC_STRIDEB %rep 7 movzx t0d, pixel [r0+i-SIZEOF_PIXEL] add r1d, t0d movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL] add r2d, t0d %assign i i+2*FDEC_STRIDEB %endrep RET %macro PRED16x16_DC 2 %if HIGH_BIT_DEPTH mova xm0, [r0 - FDEC_STRIDEB+ 0] paddw xm0, [r0 - FDEC_STRIDEB+16] HADDW xm0, xm2 paddw xm0, %1 psrlw xm0, %2 SPLATW m0, xm0 %if mmsize == 32 STORE16 m0 %else STORE16 m0, m0 %endif %else ; !HIGH_BIT_DEPTH pxor m0, m0 psadbw m0, [r0 - FDEC_STRIDE] MOVHL m1, m0 paddw m0, m1 paddusw m0, %1 psrlw m0, %2 ; dc SPLATW m0, m0 packuswb m0, m0 ; dc in bytes STORE16 m0 %endif %endmacro %macro PREDICT_16x16_DC 0 cglobal predict_16x16_dc, 1,3 call predict_16x16_dc_left_internal lea r1d, [r1+r2+16] movd xm3, r1d PRED16x16_DC xm3, 5 RET cglobal predict_16x16_dc_top, 1,2 PRED16x16_DC [pw_8], 4 RET cglobal predict_16x16_dc_left, 1,3 call predict_16x16_dc_left_internal lea r1d, [r1+r2+8] shr r1d, 4 movd xm0, r1d SPLATW m0, xm0 %if HIGH_BIT_DEPTH && mmsize == 16 STORE16 m0, m0 %else %if HIGH_BIT_DEPTH == 0 packuswb m0, m0 %endif STORE16 m0 %endif RET %endmacro INIT_XMM sse2 PREDICT_16x16_DC %if HIGH_BIT_DEPTH INIT_YMM avx2 PREDICT_16x16_DC %else INIT_XMM avx2 PREDICT_16x16_DC %endif 
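For readers following the *_p_core_* entry points above into the C glue that comes next, this is the plane-mode arithmetic they implement. The sketch below is a minimal plain-C reference, not x264 code: predict_16x16_p_ref and clip_pixel are illustrative names, FDEC_STRIDE is assumed to be x264's 8-bit value of 32, and clipping is shown for 8-bit depth only; the real predict-c.c derives H and V with the PREDICT_P_SUM macro and hands i00/b/c to the assembly cores.

#include <stdint.h>

#define FDEC_STRIDE 32          /* assumed decoded-MB stride (x264's 8-bit value) */
typedef uint8_t pixel;          /* 8-bit depth only in this sketch */

static inline pixel clip_pixel( int x )
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Plane (mode 3) prediction for a 16x16 luma block, same math as the
 * PREDICT_16x16_P_CORE / PREDICT_16x16_P_END macros in predict-c.c below. */
static void predict_16x16_p_ref( pixel *src )
{
    int H = 0, V = 0;
    for( int i = 1; i <= 8; i++ )
    {
        H += i * ( src[7+i - FDEC_STRIDE]     - src[7-i - FDEC_STRIDE] );
        V += i * ( src[(7+i)*FDEC_STRIDE - 1] - src[(7-i)*FDEC_STRIDE - 1] );
    }
    int a   = 16 * ( src[15*FDEC_STRIDE - 1] + src[15 - FDEC_STRIDE] );
    int b   = ( 5 * H + 32 ) >> 6;
    int c   = ( 5 * V + 32 ) >> 6;
    int i00 = a - 7*b - 7*c + 16;   /* value of sample (0,0) before the >>5 */

    /* Each predicted sample is clip( (i00 + b*x + c*y) >> 5 ); the asm cores
     * take i00, b, c and generate the whole plane with SIMD adds. */
    for( int y = 0; y < 16; y++, src += FDEC_STRIDE, i00 += c )
        for( int x = 0, pix = i00; x < 16; x++, pix += b )
            src[x] = clip_pixel( pix >> 5 );
}

The rare-case guard in PREDICT_16x16_P_END (falling back to x264_predict_16x16_p_c at high bit depth when i00, b or c would exceed 16-bit range) exists because the SIMD cores above perform this arithmetic in saturating 16-bit lanes (paddsw).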
x264-master/common/x86/predict-c.c000066400000000000000000000526741502133446700170300ustar00rootroot00000000000000/***************************************************************************** * predict-c.c: intra prediction ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "predict.h" #include "pixel.h" #define PREDICT_P_SUM(j,i)\ H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\ V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] ); #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; #else // !HIGH_BIT_DEPTH ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; #endif // HIGH_BIT_DEPTH #endif // HAVE_X86_INLINE_ASM #define PREDICT_16x16_P_CORE\ int H = 0;\ int V = 0;\ PREDICT_P_SUM(7,1)\ PREDICT_P_SUM(7,2)\ PREDICT_P_SUM(7,3)\ PREDICT_P_SUM(7,4)\ PREDICT_P_SUM(7,5)\ PREDICT_P_SUM(7,6)\ PREDICT_P_SUM(7,7)\ PREDICT_P_SUM(7,8) #define PREDICT_16x16_P_END(name)\ int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\ int b = ( 5 * H + 32 ) >> 6;\ int c = ( 5 * V + 32 ) >> 6;\ int i00 = a - b * 7 - c * 7 + 16;\ /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case\ * than to try to consider it in the asm. 
*/\ if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\ x264_predict_16x16_p_c( src );\ else\ x264_predict_16x16_p_core_##name( src, i00, b, c ); #define PREDICT_16x16_P(name, name2)\ static void predict_16x16_p_##name( pixel *src )\ {\ PREDICT_16x16_P_CORE\ PREDICT_16x16_P_END(name2)\ } #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH #define PREDICT_16x16_P_ASM\ asm (\ "movdqu %1, %%xmm1 \n"\ "movdqa %2, %%xmm0 \n"\ "pmaddwd %3, %%xmm0 \n"\ "pmaddwd %4, %%xmm1 \n"\ "paddd %%xmm1, %%xmm0 \n"\ "movhlps %%xmm0, %%xmm1 \n"\ "paddd %%xmm1, %%xmm0 \n"\ "pshuflw $14, %%xmm0, %%xmm1 \n"\ "paddd %%xmm1, %%xmm0 \n"\ "movd %%xmm0, %0 \n"\ :"=r"(H)\ :"m"(MEM_FIX(&src[-FDEC_STRIDE-1], const pixel, 8)),\ "m"(MEM_FIX(&src[-FDEC_STRIDE+8], const pixel, 8)),\ "m"(MEM_FIX(pw_12345678, const int16_t, 8)),\ "m"(MEM_FIX(pw_m87654321, const int16_t, 8))\ :"xmm0", "xmm1"\ ); #else // !HIGH_BIT_DEPTH #define PREDICT_16x16_P_ASM\ asm (\ "movq %1, %%mm1 \n"\ "movq %2, %%mm0 \n"\ "palignr $7, %3, %%mm1 \n"\ "pmaddubsw %4, %%mm0 \n"\ "pmaddubsw %5, %%mm1 \n"\ "paddw %%mm1, %%mm0 \n"\ "pshufw $14, %%mm0, %%mm1 \n"\ "paddw %%mm1, %%mm0 \n"\ "pshufw $1, %%mm0, %%mm1 \n"\ "paddw %%mm1, %%mm0 \n"\ "movd %%mm0, %0 \n"\ "movswl %w0, %0 \n"\ :"=r"(H)\ :"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\ "m"(MEM_FIX(&src[-FDEC_STRIDE+8], const pixel, 8)),\ "m"(MEM_FIX(&src[-FDEC_STRIDE-8], const pixel, 8)),\ "m"(MEM_FIX(pb_12345678, const int8_t, 8)),\ "m"(MEM_FIX(pb_m87654321, const int8_t, 8))\ :"mm0", "mm1"\ ); #endif // HIGH_BIT_DEPTH #define PREDICT_16x16_P_CORE_INLINE\ int H, V;\ PREDICT_16x16_P_ASM\ V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\ + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\ + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\ + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\ + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\ + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\ + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\ + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] ); #define PREDICT_16x16_P_INLINE(name, name2)\ static void predict_16x16_p_##name( pixel *src )\ {\ PREDICT_16x16_P_CORE_INLINE\ PREDICT_16x16_P_END(name2)\ } #else // !HAVE_X86_INLINE_ASM #define PREDICT_16x16_P_INLINE(name, name2) PREDICT_16x16_P(name, name2) #endif // HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH PREDICT_16x16_P_INLINE( sse2, sse2 ) #else // !HIGH_BIT_DEPTH #if !ARCH_X86_64 PREDICT_16x16_P( mmx2, mmx2 ) #endif // !ARCH_X86_64 PREDICT_16x16_P( sse2, sse2 ) #if HAVE_X86_INLINE_ASM PREDICT_16x16_P_INLINE( ssse3, sse2 ) #endif // HAVE_X86_INLINE_ASM PREDICT_16x16_P_INLINE( avx, avx ) #endif // HIGH_BIT_DEPTH PREDICT_16x16_P_INLINE( avx2, avx2 ) #define PREDICT_8x16C_P_CORE\ int H = 0, V = 0;\ for( int i = 0; i < 4; i++ )\ H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\ for( int i = 0; i < 8; i++ )\ V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); #if HIGH_BIT_DEPTH #define PREDICT_8x16C_P_END(name)\ int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ int b = ( 17 * H + 16 ) >> 5;\ int c = ( 5 * V + 32 ) >> 6;\ x264_predict_8x16c_p_core_##name( src, a, b, c ); #else // !HIGH_BIT_DEPTH #define PREDICT_8x16C_P_END(name)\ int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\ int b = ( 17 * H + 16 ) >> 5;\ int c = ( 5 * V + 32 ) >> 6;\ int i00 = a -3*b -7*c + 16;\ x264_predict_8x16c_p_core_##name( src, i00, b, c ); #endif // HIGH_BIT_DEPTH #define PREDICT_8x16C_P(name)\ 
static void predict_8x16c_p_##name( pixel *src )\ {\ PREDICT_8x16C_P_CORE\ PREDICT_8x16C_P_END(name)\ } #if !ARCH_X86_64 && !HIGH_BIT_DEPTH PREDICT_8x16C_P( mmx2 ) #endif // !ARCH_X86_64 && !HIGH_BIT_DEPTH PREDICT_8x16C_P( sse2 ) PREDICT_8x16C_P( avx ) PREDICT_8x16C_P( avx2 ) #define PREDICT_8x8C_P_CORE\ int H = 0;\ int V = 0;\ PREDICT_P_SUM(3,1)\ PREDICT_P_SUM(3,2)\ PREDICT_P_SUM(3,3)\ PREDICT_P_SUM(3,4) #if HIGH_BIT_DEPTH #define PREDICT_8x8C_P_END(name)\ int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ int b = ( 17 * H + 16 ) >> 5;\ int c = ( 17 * V + 16 ) >> 5;\ x264_predict_8x8c_p_core_##name( src, a, b, c ); #else // !HIGH_BIT_DEPTH #define PREDICT_8x8C_P_END(name)\ int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\ int b = ( 17 * H + 16 ) >> 5;\ int c = ( 17 * V + 16 ) >> 5;\ int i00 = a -3*b -3*c + 16;\ x264_predict_8x8c_p_core_##name( src, i00, b, c ); #endif // HIGH_BIT_DEPTH #define PREDICT_8x8C_P(name, name2)\ static void predict_8x8c_p_##name( pixel *src )\ {\ PREDICT_8x8C_P_CORE\ PREDICT_8x8C_P_END(name2)\ } #if HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH #define PREDICT_8x8C_P_ASM\ asm (\ "movdqa %1, %%xmm0 \n"\ "pmaddwd %2, %%xmm0 \n"\ "movhlps %%xmm0, %%xmm1 \n"\ "paddd %%xmm1, %%xmm0 \n"\ "pshuflw $14, %%xmm0, %%xmm1 \n"\ "paddd %%xmm1, %%xmm0 \n"\ "movd %%xmm0, %0 \n"\ :"=r"(H)\ :"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\ "m"(MEM_FIX(pw_m32101234, const int16_t, 8))\ :"xmm0", "xmm1"\ ); #else // !HIGH_BIT_DEPTH #define PREDICT_8x8C_P_ASM\ asm (\ "movq %1, %%mm0 \n"\ "pmaddubsw %2, %%mm0 \n"\ "pshufw $14, %%mm0, %%mm1 \n"\ "paddw %%mm1, %%mm0 \n"\ "pshufw $1, %%mm0, %%mm1 \n"\ "paddw %%mm1, %%mm0 \n"\ "movd %%mm0, %0 \n"\ "movswl %w0, %0 \n"\ :"=r"(H)\ :"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\ "m"(MEM_FIX(pb_m32101234, const int8_t, 8))\ :"mm0", "mm1"\ ); #endif // HIGH_BIT_DEPTH #define PREDICT_8x8C_P_CORE_INLINE\ int H, V;\ PREDICT_8x8C_P_ASM\ V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\ + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\ + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\ + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\ H += -4 * src[-1*FDEC_STRIDE -1]; #define PREDICT_8x8C_P_INLINE(name, name2)\ static void predict_8x8c_p_##name( pixel *src )\ {\ PREDICT_8x8C_P_CORE_INLINE\ PREDICT_8x8C_P_END(name2)\ } #else // !HAVE_X86_INLINE_ASM #define PREDICT_8x8C_P_INLINE(name, name2) PREDICT_8x8C_P(name, name2) #endif // HAVE_X86_INLINE_ASM #if HIGH_BIT_DEPTH PREDICT_8x8C_P_INLINE( sse2, sse2 ) #else //!HIGH_BIT_DEPTH #if !ARCH_X86_64 PREDICT_8x8C_P( mmx2, mmx2 ) #endif // !ARCH_X86_64 PREDICT_8x8C_P( sse2, sse2 ) #if HAVE_X86_INLINE_ASM PREDICT_8x8C_P_INLINE( ssse3, sse2 ) #endif // HAVE_X86_INLINE_ASM #endif // HIGH_BIT_DEPTH PREDICT_8x8C_P_INLINE( avx, avx ) PREDICT_8x8C_P_INLINE( avx2, avx2 ) #if ARCH_X86_64 && !HIGH_BIT_DEPTH static void predict_8x8c_dc_left( uint8_t *src ) { int y; uint32_t s0 = 0, s1 = 0; uint64_t dc0, dc1; for( y = 0; y < 4; y++ ) { s0 += src[y * FDEC_STRIDE - 1]; s1 += src[(y+4) * FDEC_STRIDE - 1]; } dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL; dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL; for( y = 0; y < 4; y++ ) { M64( src ) = dc0; src += FDEC_STRIDE; } for( y = 0; y < 4; y++ ) { M64( src ) = dc1; src += FDEC_STRIDE; } } #endif // ARCH_X86_64 && !HIGH_BIT_DEPTH /**************************************************************************** * Exported functions: ****************************************************************************/ void 
x264_predict_16x16_init_mmx( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2; pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2; #if HIGH_BIT_DEPTH if( !(cpu&X264_CPU_SSE) ) return; pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2; pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2; pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2; pf[I_PRED_16x16_P] = predict_16x16_p_sse2; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_16x16_V] = x264_predict_16x16_v_avx; if( !(cpu&X264_CPU_AVX2) ) return; pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2; #else #if !ARCH_X86_64 pf[I_PRED_16x16_P] = predict_16x16_p_mmx2; #endif if( !(cpu&X264_CPU_SSE) ) return; pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2; if( cpu&X264_CPU_SSE2_IS_SLOW ) return; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2; pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2; pf[I_PRED_16x16_P] = predict_16x16_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; if( !(cpu&X264_CPU_SLOW_PSHUFB) ) pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3; #if HAVE_X86_INLINE_ASM pf[I_PRED_16x16_P] = predict_16x16_p_ssse3; #endif if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_16x16_P] = predict_16x16_p_avx; #endif // HIGH_BIT_DEPTH if( cpu&X264_CPU_AVX2 ) { pf[I_PRED_16x16_P] = predict_16x16_p_avx2; pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_avx2; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_avx2; pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2; } } void x264_predict_8x8c_init_mmx( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_MMX) ) return; #if HIGH_BIT_DEPTH pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx; if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2; if( !(cpu&X264_CPU_SSE) ) return; pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2; pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = predict_8x8c_p_avx; if( !(cpu&X264_CPU_AVX2) ) return; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_avx2; #else #if ARCH_X86_64 pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left; #endif pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx; if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2; #if !ARCH_X86_64 pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmx2; #endif pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3; #if HAVE_X86_INLINE_ASM pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3; #endif if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = predict_8x8c_p_avx; #endif // HIGH_BIT_DEPTH if( cpu&X264_CPU_AVX2 ) { pf[I_PRED_CHROMA_P] = predict_8x8c_p_avx2; } } void x264_predict_8x16c_init_mmx( uint32_t cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_MMX) ) return; #if HIGH_BIT_DEPTH if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = 
x264_predict_8x16c_h_mmx2; if( !(cpu&X264_CPU_SSE) ) return; pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2; pf[I_PRED_CHROMA_P] = predict_8x16c_p_sse2; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = predict_8x16c_p_avx; if( !(cpu&X264_CPU_AVX2) ) return; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_avx2; #else pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx; if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2; pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2; #if !ARCH_X86_64 pf[I_PRED_CHROMA_P] = predict_8x16c_p_mmx2; #endif if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_CHROMA_P] = predict_8x16c_p_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_ssse3; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_CHROMA_P] = predict_8x16c_p_avx; #endif // HIGH_BIT_DEPTH if( cpu&X264_CPU_AVX2 ) { pf[I_PRED_CHROMA_P] = predict_8x16c_p_avx2; } } void x264_predict_8x8_init_mmx( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ) { if( !(cpu&X264_CPU_MMX2) ) return; #if HIGH_BIT_DEPTH if( !(cpu&X264_CPU_SSE) ) return; pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse; if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_8x8_H] = x264_predict_8x8_h_sse2; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_sse2; pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2; pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; *predict_8x8_filter = x264_predict_8x8_filter_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_ssse3; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_cache64_ssse3; pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_cache64_ssse3; } if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx; *predict_8x8_filter = x264_predict_8x8_filter_avx; #else pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmx2; pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmx2; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmx2; pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2; pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_mmx2; *predict_8x8_filter = x264_predict_8x8_filter_mmx2; #if ARCH_X86 pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmx2; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmx2; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmx2; #endif if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2; 
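/* Note: each successive cpuflag tier below simply overwrites earlier pf[] entries,
 * so after init the table ends up holding the most capable implementation the
 * host CPU supports. */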
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; if( !(cpu&X264_CPU_SLOW_PALIGNR) ) { pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3; } pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx; pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx; pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx; pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx; pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx; #endif // HIGH_BIT_DEPTH } void x264_predict_4x4_init_mmx( uint32_t cpu, x264_predict_t pf[12] ) { if( !(cpu&X264_CPU_MMX2) ) return; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmx2; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmx2; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmx2; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmx2; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmx2; pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmx2; #if HIGH_BIT_DEPTH if( !(cpu&X264_CPU_SSE2) ) return; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_sse2; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_sse2; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_sse2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3; if( !(cpu&X264_CPU_AVX) ) return; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx; if( !(cpu&X264_CPU_AVX2) ) return; pf[I_PRED_4x4_H] = x264_predict_4x4_h_avx2; #else pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2; if( !(cpu&X264_CPU_SSSE3) ) return; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_cache64_ssse3; #endif // HIGH_BIT_DEPTH } x264-master/common/x86/predict.h000066400000000000000000000372411502133446700166060ustar00rootroot00000000000000/***************************************************************************** * predict.h: x86 intra prediction ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_X86_PREDICT_H #define X264_X86_PREDICT_H #define x264_predict_16x16_init_mmx x264_template(predict_16x16_init_mmx) void x264_predict_16x16_init_mmx( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_8x16c_init_mmx x264_template(predict_8x16c_init_mmx) void x264_predict_8x16c_init_mmx( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_8x8c_init_mmx x264_template(predict_8x8c_init_mmx) void x264_predict_8x8c_init_mmx ( uint32_t cpu, x264_predict_t pf[7] ); #define x264_predict_4x4_init_mmx x264_template(predict_4x4_init_mmx) void x264_predict_4x4_init_mmx ( uint32_t cpu, x264_predict_t pf[12] ); #define x264_predict_8x8_init_mmx x264_template(predict_8x8_init_mmx) void x264_predict_8x8_init_mmx ( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter ); #define x264_predict_16x16_v_mmx2 x264_template(predict_16x16_v_mmx2) void x264_predict_16x16_v_mmx2( pixel *src ); #define x264_predict_16x16_v_sse x264_template(predict_16x16_v_sse) void x264_predict_16x16_v_sse ( pixel *src ); #define x264_predict_16x16_v_avx x264_template(predict_16x16_v_avx) void x264_predict_16x16_v_avx ( uint16_t *src ); #define x264_predict_16x16_h_mmx2 x264_template(predict_16x16_h_mmx2) void x264_predict_16x16_h_mmx2( pixel *src ); #define x264_predict_16x16_h_sse2 x264_template(predict_16x16_h_sse2) void x264_predict_16x16_h_sse2( uint16_t *src ); #define x264_predict_16x16_h_ssse3 x264_template(predict_16x16_h_ssse3) void x264_predict_16x16_h_ssse3( uint8_t *src ); #define x264_predict_16x16_h_avx2 x264_template(predict_16x16_h_avx2) void x264_predict_16x16_h_avx2( uint16_t *src ); #define x264_predict_16x16_dc_sse2 x264_template(predict_16x16_dc_sse2) void x264_predict_16x16_dc_sse2( pixel *src ); #define x264_predict_16x16_dc_avx2 x264_template(predict_16x16_dc_avx2) void x264_predict_16x16_dc_avx2( pixel *src ); #define x264_predict_16x16_dc_left_sse2 x264_template(predict_16x16_dc_left_sse2) void x264_predict_16x16_dc_left_sse2( pixel *src ); #define x264_predict_16x16_dc_left_avx2 x264_template(predict_16x16_dc_left_avx2) void x264_predict_16x16_dc_left_avx2( pixel *src ); #define x264_predict_16x16_dc_top_sse2 x264_template(predict_16x16_dc_top_sse2) void x264_predict_16x16_dc_top_sse2( pixel *src ); #define x264_predict_16x16_dc_top_avx2 x264_template(predict_16x16_dc_top_avx2) void x264_predict_16x16_dc_top_avx2( pixel *src ); #define x264_predict_16x16_p_core_mmx2 x264_template(predict_16x16_p_core_mmx2) void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c ); #define x264_predict_16x16_p_core_sse2 x264_template(predict_16x16_p_core_sse2) void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c ); #define x264_predict_16x16_p_core_avx x264_template(predict_16x16_p_core_avx) void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c ); #define x264_predict_16x16_p_core_avx2 x264_template(predict_16x16_p_core_avx2) void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c ); #define x264_predict_8x16c_dc_mmx2 x264_template(predict_8x16c_dc_mmx2) void x264_predict_8x16c_dc_mmx2( pixel *src ); #define x264_predict_8x16c_dc_sse2 x264_template(predict_8x16c_dc_sse2) void x264_predict_8x16c_dc_sse2( uint16_t *src ); #define x264_predict_8x16c_dc_top_mmx2 x264_template(predict_8x16c_dc_top_mmx2) void x264_predict_8x16c_dc_top_mmx2( uint8_t *src ); #define x264_predict_8x16c_dc_top_sse2 
x264_template(predict_8x16c_dc_top_sse2) void x264_predict_8x16c_dc_top_sse2( uint16_t *src ); #define x264_predict_8x16c_v_mmx x264_template(predict_8x16c_v_mmx) void x264_predict_8x16c_v_mmx( uint8_t *src ); #define x264_predict_8x16c_v_sse x264_template(predict_8x16c_v_sse) void x264_predict_8x16c_v_sse( uint16_t *src ); #define x264_predict_8x16c_h_mmx2 x264_template(predict_8x16c_h_mmx2) void x264_predict_8x16c_h_mmx2( pixel *src ); #define x264_predict_8x16c_h_sse2 x264_template(predict_8x16c_h_sse2) void x264_predict_8x16c_h_sse2( uint16_t *src ); #define x264_predict_8x16c_h_ssse3 x264_template(predict_8x16c_h_ssse3) void x264_predict_8x16c_h_ssse3( uint8_t *src ); #define x264_predict_8x16c_h_avx2 x264_template(predict_8x16c_h_avx2) void x264_predict_8x16c_h_avx2( uint16_t *src ); #define x264_predict_8x16c_p_core_mmx2 x264_template(predict_8x16c_p_core_mmx2) void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); #define x264_predict_8x16c_p_core_sse2 x264_template(predict_8x16c_p_core_sse2) void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c ); #define x264_predict_8x16c_p_core_avx x264_template(predict_8x16c_p_core_avx) void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c ); #define x264_predict_8x16c_p_core_avx2 x264_template(predict_8x16c_p_core_avx2) void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c ); #define x264_predict_8x8c_p_core_mmx2 x264_template(predict_8x8c_p_core_mmx2) void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c ); #define x264_predict_8x8c_p_core_sse2 x264_template(predict_8x8c_p_core_sse2) void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c ); #define x264_predict_8x8c_p_core_avx x264_template(predict_8x8c_p_core_avx) void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c ); #define x264_predict_8x8c_p_core_avx2 x264_template(predict_8x8c_p_core_avx2) void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c ); #define x264_predict_8x8c_dc_mmx2 x264_template(predict_8x8c_dc_mmx2) void x264_predict_8x8c_dc_mmx2( pixel *src ); #define x264_predict_8x8c_dc_sse2 x264_template(predict_8x8c_dc_sse2) void x264_predict_8x8c_dc_sse2( uint16_t *src ); #define x264_predict_8x8c_dc_top_mmx2 x264_template(predict_8x8c_dc_top_mmx2) void x264_predict_8x8c_dc_top_mmx2( uint8_t *src ); #define x264_predict_8x8c_dc_top_sse2 x264_template(predict_8x8c_dc_top_sse2) void x264_predict_8x8c_dc_top_sse2( uint16_t *src ); #define x264_predict_8x8c_v_mmx x264_template(predict_8x8c_v_mmx) void x264_predict_8x8c_v_mmx( pixel *src ); #define x264_predict_8x8c_v_sse x264_template(predict_8x8c_v_sse) void x264_predict_8x8c_v_sse( uint16_t *src ); #define x264_predict_8x8c_h_mmx2 x264_template(predict_8x8c_h_mmx2) void x264_predict_8x8c_h_mmx2( pixel *src ); #define x264_predict_8x8c_h_sse2 x264_template(predict_8x8c_h_sse2) void x264_predict_8x8c_h_sse2( uint16_t *src ); #define x264_predict_8x8c_h_ssse3 x264_template(predict_8x8c_h_ssse3) void x264_predict_8x8c_h_ssse3( uint8_t *src ); #define x264_predict_8x8c_h_avx2 x264_template(predict_8x8c_h_avx2) void x264_predict_8x8c_h_avx2( uint16_t *src ); #define x264_predict_8x8_v_mmx2 x264_template(predict_8x8_v_mmx2) void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_v_sse x264_template(predict_8x8_v_sse) void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] ); #define x264_predict_8x8_h_mmx2 x264_template(predict_8x8_h_mmx2) void 
x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_h_sse2 x264_template(predict_8x8_h_sse2) void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] ); #define x264_predict_8x8_hd_mmx2 x264_template(predict_8x8_hd_mmx2) void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_hu_mmx2 x264_template(predict_8x8_hu_mmx2) void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_dc_mmx2 x264_template(predict_8x8_dc_mmx2) void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_dc_sse2 x264_template(predict_8x8_dc_sse2) void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] ); #define x264_predict_8x8_dc_top_mmx2 x264_template(predict_8x8_dc_top_mmx2) void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_dc_top_sse2 x264_template(predict_8x8_dc_top_sse2) void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] ); #define x264_predict_8x8_dc_left_mmx2 x264_template(predict_8x8_dc_left_mmx2) void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_dc_left_sse2 x264_template(predict_8x8_dc_left_sse2) void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] ); #define x264_predict_8x8_ddl_mmx2 x264_template(predict_8x8_ddl_mmx2) void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_ddl_sse2 x264_template(predict_8x8_ddl_sse2) void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddl_ssse3 x264_template(predict_8x8_ddl_ssse3) void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddl_cache64_ssse3 x264_template(predict_8x8_ddl_cache64_ssse3) void x264_predict_8x8_ddl_cache64_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddl_avx x264_template(predict_8x8_ddl_avx) void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddr_mmx2 x264_template(predict_8x8_ddr_mmx2) void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_ddr_sse2 x264_template(predict_8x8_ddr_sse2) void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddr_ssse3 x264_template(predict_8x8_ddr_ssse3) void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddr_cache64_ssse3 x264_template(predict_8x8_ddr_cache64_ssse3) void x264_predict_8x8_ddr_cache64_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_ddr_avx x264_template(predict_8x8_ddr_avx) void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vl_sse2 x264_template(predict_8x8_vl_sse2) void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vl_ssse3 x264_template(predict_8x8_vl_ssse3) void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vl_avx x264_template(predict_8x8_vl_avx) void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vl_mmx2 x264_template(predict_8x8_vl_mmx2) void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_vr_mmx2 x264_template(predict_8x8_vr_mmx2) void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] ); #define x264_predict_8x8_vr_sse2 x264_template(predict_8x8_vr_sse2) void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vr_ssse3 x264_template(predict_8x8_vr_ssse3) void 
x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_vr_avx x264_template(predict_8x8_vr_avx) void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_hu_sse2 x264_template(predict_8x8_hu_sse2) void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] ); #define x264_predict_8x8_hu_ssse3 x264_template(predict_8x8_hu_ssse3) void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_hu_avx x264_template(predict_8x8_hu_avx) void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_hd_sse2 x264_template(predict_8x8_hd_sse2) void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] ); #define x264_predict_8x8_hd_ssse3 x264_template(predict_8x8_hd_ssse3) void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] ); #define x264_predict_8x8_hd_avx x264_template(predict_8x8_hd_avx) void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] ); #define x264_predict_8x8_filter_mmx2 x264_template(predict_8x8_filter_mmx2) void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters ); #define x264_predict_8x8_filter_sse2 x264_template(predict_8x8_filter_sse2) void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); #define x264_predict_8x8_filter_ssse3 x264_template(predict_8x8_filter_ssse3) void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters ); #define x264_predict_8x8_filter_avx x264_template(predict_8x8_filter_avx) void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters ); #define x264_predict_4x4_h_avx2 x264_template(predict_4x4_h_avx2) void x264_predict_4x4_h_avx2( uint16_t *src ); #define x264_predict_4x4_ddl_mmx2 x264_template(predict_4x4_ddl_mmx2) void x264_predict_4x4_ddl_mmx2( pixel *src ); #define x264_predict_4x4_ddl_sse2 x264_template(predict_4x4_ddl_sse2) void x264_predict_4x4_ddl_sse2( uint16_t *src ); #define x264_predict_4x4_ddl_avx x264_template(predict_4x4_ddl_avx) void x264_predict_4x4_ddl_avx( uint16_t *src ); #define x264_predict_4x4_ddr_mmx2 x264_template(predict_4x4_ddr_mmx2) void x264_predict_4x4_ddr_mmx2( pixel *src ); #define x264_predict_4x4_vl_mmx2 x264_template(predict_4x4_vl_mmx2) void x264_predict_4x4_vl_mmx2( pixel *src ); #define x264_predict_4x4_vl_sse2 x264_template(predict_4x4_vl_sse2) void x264_predict_4x4_vl_sse2( uint16_t *src ); #define x264_predict_4x4_vl_avx x264_template(predict_4x4_vl_avx) void x264_predict_4x4_vl_avx( uint16_t *src ); #define x264_predict_4x4_vr_mmx2 x264_template(predict_4x4_vr_mmx2) void x264_predict_4x4_vr_mmx2( uint8_t *src ); #define x264_predict_4x4_vr_sse2 x264_template(predict_4x4_vr_sse2) void x264_predict_4x4_vr_sse2( uint16_t *src ); #define x264_predict_4x4_vr_ssse3 x264_template(predict_4x4_vr_ssse3) void x264_predict_4x4_vr_ssse3( pixel *src ); #define x264_predict_4x4_vr_cache64_ssse3 x264_template(predict_4x4_vr_cache64_ssse3) void x264_predict_4x4_vr_cache64_ssse3( uint8_t *src ); #define x264_predict_4x4_vr_avx x264_template(predict_4x4_vr_avx) void x264_predict_4x4_vr_avx( uint16_t *src ); #define x264_predict_4x4_hd_mmx2 x264_template(predict_4x4_hd_mmx2) void x264_predict_4x4_hd_mmx2( pixel *src ); #define x264_predict_4x4_hd_sse2 x264_template(predict_4x4_hd_sse2) void x264_predict_4x4_hd_sse2( uint16_t *src ); #define x264_predict_4x4_hd_ssse3 x264_template(predict_4x4_hd_ssse3) void x264_predict_4x4_hd_ssse3( pixel *src ); #define 
x264_predict_4x4_hd_avx x264_template(predict_4x4_hd_avx) void x264_predict_4x4_hd_avx( uint16_t *src ); #define x264_predict_4x4_dc_mmx2 x264_template(predict_4x4_dc_mmx2) void x264_predict_4x4_dc_mmx2( pixel *src ); #define x264_predict_4x4_ddr_sse2 x264_template(predict_4x4_ddr_sse2) void x264_predict_4x4_ddr_sse2( uint16_t *src ); #define x264_predict_4x4_ddr_ssse3 x264_template(predict_4x4_ddr_ssse3) void x264_predict_4x4_ddr_ssse3( pixel *src ); #define x264_predict_4x4_ddr_avx x264_template(predict_4x4_ddr_avx) void x264_predict_4x4_ddr_avx( uint16_t *src ); #define x264_predict_4x4_hu_mmx2 x264_template(predict_4x4_hu_mmx2) void x264_predict_4x4_hu_mmx2( pixel *src ); #endif x264-master/common/x86/quant-a.asm000066400000000000000000001441001502133446700170440ustar00rootroot00000000000000;***************************************************************************** ;* quant-a.asm: x86 quantization and level-run ;***************************************************************************** ;* Copyright (C) 2005-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Christian Heine ;* Oskar Arvidsson ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 64 %if HIGH_BIT_DEPTH decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15 %else dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30 dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62 %endif %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 %endmacro %macro DQM8 6 dw %1, %4, %5, %4, %1, %4, %5, %4 dw %4, %2, %6, %2, %4, %2, %6, %2 dw %5, %6, %3, %6, %5, %6, %3, %6 dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro dequant8_scale: DQM8 20, 18, 32, 19, 25, 24 DQM8 22, 19, 35, 21, 28, 26 DQM8 26, 23, 42, 24, 33, 31 DQM8 28, 25, 45, 26, 35, 33 DQM8 32, 28, 51, 30, 40, 38 DQM8 36, 32, 58, 34, 46, 43 dequant4_scale: DQM4 10, 13, 16 DQM4 11, 14, 18 DQM4 13, 16, 20 DQM4 14, 18, 23 DQM4 16, 20, 25 DQM4 18, 23, 29 decimate_mask_table4: db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14 db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13 db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10 db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13 db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12 db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9 db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16 db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24 chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1 chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0 chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1 chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1 %if HIGH_BIT_DEPTH==0 dct_coef_shuffle: %macro DCT_COEF_SHUFFLE 8 %assign y x %rep 8 %rep 7 %rotate (~(y>>7))&1 %assign y y<<((~(y>>7))&1) %endrep db %1*2 %rotate 1 %assign y y<<1 %endrep %endmacro %assign x 0 %rep 256 DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0 %assign x x+1 %endrep %endif SECTION .text cextern pb_1 cextern pw_1 cextern pw_2 cextern pw_256 cextern pd_1 cextern pb_01 cextern pd_1024 cextern deinterleave_shufd cextern popcnt_table %macro QUANT_DC_START 2 movd xm%1, r1m ; mf movd xm%2, r2m ; bias %if cpuflag(avx2) vpbroadcastdct m%1, xm%1 vpbroadcastdct m%2, xm%2 %elif HIGH_BIT_DEPTH SPLATD m%1, m%1 SPLATD m%2, m%2 %elif cpuflag(sse4) ; ssse3, but not faster on conroe mova m5, [pb_01] pshufb m%1, m5 pshufb m%2, m5 %else SPLATW m%1, m%1 SPLATW m%2, m%2 %endif %endmacro %macro QUANT_END 0 xor eax, eax %if cpuflag(sse4) ptest m5, m5 %else ; !sse4 %if ARCH_X86_64 %if mmsize == 16 packsswb m5, m5 %endif movq rcx, m5 test rcx, rcx %else %if mmsize == 16 pxor m4, m4 pcmpeqb m5, m4 pmovmskb ecx, m5 cmp ecx, (1< 0 mov t0d, 8*(%2-2*%3) %%loop: %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL] %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL] sub t0d, 16*%3 jge %%loop RET %else %if mmsize < 32 %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL] %endif %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL] RET %endif %endmacro %macro DEQUANT16_FLAT 2-5 mova m0, %1 psllw m0, m4 %assign i %0-2 %rep %0-1 %if i mova m %+ i, [r0+%2] pmullw m %+ i, m0 %else pmullw m0, [r0+%2] %endif mova [r0+%2], m %+ i %assign i i-1 %rotate 1 %endrep %endmacro %if ARCH_X86_64 DECLARE_REG_TMP 6,3,2 %else DECLARE_REG_TMP 2,0,1 %endif %macro DEQUANT_START 
2 movifnidn t2d, r2m imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 lea t1d, [t0*5] sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %1 %if ARCH_X86_64 add r1, t2 ; dequant_mf[i_mf] %else add r1, r1mp ; dequant_mf[i_mf] mov r0, r0mp ; dct %endif sub t0d, %2 jl .rshift32 ; negative qbits => rightshift %endmacro ;----------------------------------------------------------------------------- ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp ) ;----------------------------------------------------------------------------- %macro DEQUANT 3 cglobal dequant_%1x%1, 0,3,6 .skip_prologue: DEQUANT_START %2+2, %2 .lshift: movd xm2, t0d DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3 .rshift32: neg t0d mova m3, [pd_1] movd xm2, t0d pslld m3, xm2 pxor m4, m4 psrld m3, 1 DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3 %if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32) cglobal dequant_%1x%1_flat16, 0,3 movifnidn t2d, r2m %if %1 == 8 cmp t2d, 12 jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue sub t2d, 12 %endif imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 lea t1d, [t0*5] sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %2 %if ARCH_X86_64 lea r1, [dequant%1_scale] add r1, t2 %else lea r1, [dequant%1_scale + t2] %endif movifnidn r0, r0mp movd xm4, t0d %if %1 == 4 %if mmsize == 8 DEQUANT16_FLAT [r1], 0, 16 DEQUANT16_FLAT [r1+8], 8, 24 %elif mmsize == 16 DEQUANT16_FLAT [r1], 0, 16 %else vbroadcasti128 m0, [r1] psllw m0, xm4 pmullw m0, [r0] mova [r0], m0 %endif %elif mmsize == 8 DEQUANT16_FLAT [r1], 0, 8, 64, 72 DEQUANT16_FLAT [r1+16], 16, 24, 48, 56 DEQUANT16_FLAT [r1+16], 80, 88, 112, 120 DEQUANT16_FLAT [r1+32], 32, 40, 96, 104 %elif mmsize == 16 DEQUANT16_FLAT [r1], 0, 64 DEQUANT16_FLAT [r1+16], 16, 48, 80, 112 DEQUANT16_FLAT [r1+32], 32, 96 %else mova m1, [r1+ 0] mova m2, [r1+32] psllw m1, xm4 psllw m2, xm4 pmullw m0, m1, [r0+ 0] pmullw m3, m2, [r0+32] pmullw m4, m1, [r0+64] pmullw m5, m2, [r0+96] mova [r0+ 0], m0 mova [r0+32], m3 mova [r0+64], m4 mova [r0+96], m5 %endif RET %endif ; !HIGH_BIT_DEPTH && !AVX %endmacro ; DEQUANT %if HIGH_BIT_DEPTH INIT_XMM sse2 DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 INIT_XMM xop DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 INIT_YMM avx2 DEQUANT 4, 4, 4 DEQUANT 8, 6, 4 %else %if ARCH_X86_64 == 0 INIT_MMX mmx DEQUANT 4, 4, 1 DEQUANT 8, 6, 1 %endif INIT_XMM sse2 DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 INIT_XMM avx DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 INIT_XMM xop DEQUANT 4, 4, 2 DEQUANT 8, 6, 2 INIT_YMM avx2 DEQUANT 4, 4, 4 DEQUANT 8, 6, 4 %endif %macro DEQUANT_START_AVX512 1-2 0 ; shift, flat %if %2 == 0 movifnidn t2d, r2m %endif imul t0d, t2d, 0x2b shr t0d, 8 ; i_qbits = i_qp / 6 lea t1d, [t0*5] sub t2d, t0d sub t2d, t1d ; i_mf = i_qp % 6 shl t2d, %1 %if %2 %if ARCH_X86_64 %define dmf r1+t2 lea r1, [dequant8_scale] %else %define dmf t2+dequant8_scale %endif %elif ARCH_X86_64 %define dmf r1+t2 %else %define dmf r1 add r1, r1mp ; dequant_mf[i_mf] %endif movifnidn r0, r0mp %endmacro INIT_ZMM avx512 cglobal dequant_4x4, 0,3 DEQUANT_START_AVX512 6 mova m0, [dmf] %if HIGH_BIT_DEPTH pmaddwd m0, [r0] %endif sub t0d, 4 jl .rshift %if HIGH_BIT_DEPTH vpbroadcastd m1, t0d vpsllvd m0, m1 mova [r0], m0 %else vpbroadcastw ym1, t0d vpmovsdw ym0, m0 pmullw ym0, [r0] vpsllvw ym0, ym1 mova [r0], ym0 %endif RET .rshift: %if HIGH_BIT_DEPTH == 0 pmovzxwd m1, [r0] pmaddwd m0, m1 %endif mov r1d, 1<<31 shrx r1d, r1d, t0d ; 1 << (-i_qbits-1) neg t0d vpbroadcastd m1, r1d vpbroadcastd m2, t0d paddd m0, m1 vpsravd m0, m2 %if HIGH_BIT_DEPTH mova [r0], m0 %else vpmovsdw [r0], m0 %endif RET cglobal dequant_8x8, 
0,3 DEQUANT_START_AVX512 8 mova m0, [dmf+0*64] mova m1, [dmf+1*64] mova m2, [dmf+2*64] mova m3, [dmf+3*64] %if HIGH_BIT_DEPTH pmaddwd m0, [r0+0*64] pmaddwd m1, [r0+1*64] pmaddwd m2, [r0+2*64] pmaddwd m3, [r0+3*64] %else mova m6, [dequant_shuf_avx512] %endif sub t0d, 6 jl .rshift %if HIGH_BIT_DEPTH vpbroadcastd m4, t0d vpsllvd m0, m4 vpsllvd m1, m4 vpsllvd m2, m4 vpsllvd m3, m4 jmp .end .rshift: %else vpbroadcastw m4, t0d vpermt2w m0, m6, m1 vpermt2w m2, m6, m3 pmullw m0, [r0] pmullw m2, [r0+64] vpsllvw m0, m4 vpsllvw m2, m4 mova [r0], m0 mova [r0+64], m2 RET .rshift: pmovzxwd m4, [r0+0*32] pmovzxwd m5, [r0+1*32] pmaddwd m0, m4 pmaddwd m1, m5 pmovzxwd m4, [r0+2*32] pmovzxwd m5, [r0+3*32] pmaddwd m2, m4 pmaddwd m3, m5 %endif mov r1d, 1<<31 shrx r1d, r1d, t0d ; 1 << (-i_qbits-1) neg t0d vpbroadcastd m4, r1d vpbroadcastd m5, t0d paddd m0, m4 paddd m1, m4 vpsravd m0, m5 vpsravd m1, m5 paddd m2, m4 paddd m3, m4 vpsravd m2, m5 vpsravd m3, m5 %if HIGH_BIT_DEPTH .end: mova [r0+0*64], m0 mova [r0+1*64], m1 mova [r0+2*64], m2 mova [r0+3*64], m3 %else vpermt2w m0, m6, m1 vpermt2w m2, m6, m3 mova [r0], m0 mova [r0+64], m2 %endif RET %if HIGH_BIT_DEPTH == 0 cglobal dequant_8x8_flat16, 0,3 movifnidn t2d, r2m cmp t2d, 12 jl dequant_8x8_avx512 sub t2d, 12 DEQUANT_START_AVX512 6, 1 vpbroadcastw m0, t0d mova m1, [dmf] vpsllvw m1, m0 pmullw m0, m1, [r0] pmullw m1, [r0+64] mova [r0], m0 mova [r0+64], m1 RET %endif %undef dmf %macro DEQUANT_DC 2 cglobal dequant_4x4dc, 0,3,6 DEQUANT_START 6, 6 .lshift: %if cpuflag(avx2) vpbroadcastdct m3, [r1] %else movd xm3, [r1] SPLAT%1 m3, xm3 %endif movd xm2, t0d pslld m3, xm2 %assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize %2 m0, m3, [r0+%%x] mova [r0+%%x], m0 %assign %%x %%x+mmsize %endrep RET .rshift32: neg t0d %if cpuflag(avx2) vpbroadcastdct m2, [r1] %else movd xm2, [r1] %endif mova m5, [p%1_1] movd xm3, t0d pslld m4, m5, xm3 psrld m4, 1 %if HIGH_BIT_DEPTH %if notcpuflag(avx2) pshufd m2, m2, 0 %endif %assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize pmadcswd m0, m2, [r0+%%x], m4 psrad m0, xm3 mova [r0+%%x], m0 %assign %%x %%x+mmsize %endrep %else ; !HIGH_BIT_DEPTH %if notcpuflag(avx2) PSHUFLW m2, m2, 0 %endif punpcklwd m2, m4 %assign %%x 0 %rep SIZEOF_PIXEL*32/mmsize mova m0, [r0+%%x] punpckhwd m1, m0, m5 punpcklwd m0, m5 pmaddwd m0, m2 pmaddwd m1, m2 psrad m0, xm3 psrad m1, xm3 packssdw m0, m1 mova [r0+%%x], m0 %assign %%x %%x+mmsize %endrep %endif ; !HIGH_BIT_DEPTH RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM sse2 DEQUANT_DC d, pmaddwd INIT_XMM xop DEQUANT_DC d, pmaddwd INIT_YMM avx2 DEQUANT_DC d, pmaddwd %else %if ARCH_X86_64 == 0 INIT_MMX mmx2 DEQUANT_DC w, pmullw %endif INIT_XMM sse2 DEQUANT_DC w, pmullw INIT_XMM avx DEQUANT_DC w, pmullw INIT_YMM avx2 DEQUANT_DC w, pmullw %endif %macro PEXTRW 4 %if cpuflag(sse4) pextrw %1, %2, %3 %else ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback %if %3 pextrw %4d, %2, %3 %else movd %4d, %2 %endif mov %1, %4w %endif %endmacro ;----------------------------------------------------------------------------- ; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ) ; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp ) ;----------------------------------------------------------------------------- %macro DEQUANT_2x4_DC 1 %ifidn %1, dconly DECLARE_REG_TMP 6,3,2 %define %%args dct, dmf, qp %else DECLARE_REG_TMP 6,4,3 %define %%args dct, dct4x4, dmf, qp %endif %if ARCH_X86_64 == 0 DECLARE_REG_TMP 2,0,1 %endif cglobal idct_dequant_2x4_%1, 0,3,5, %%args 
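; The qp -> (qbits, mf) split right below uses the usual fixed-point trick
; instead of a real division: 43/256 (0x2b/256) is just above 1/6, so
; (qp*0x2b)>>8 == qp/6 for the whole QP range handled here, and
; qp - 5*(qp/6) - (qp/6) recovers qp%6.  A hedged C sketch of what the next
; few instructions compute (variable names follow the comments in the asm):
;
;   int i_qbits = (qp * 0x2b) >> 8;          /* == qp / 6 for qp <= 127 */
;   int i_mf    = qp - 5*i_qbits - i_qbits;  /* == qp % 6               */
;   /* e.g. qp = 40: 40*0x2b = 1720, 1720>>8 = 6;  40 - 30 - 6 = 4 */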
movifnidn t2d, qpm imul t0d, t2d, 0x2b shr t0d, 8 ; qp / 6 lea t1d, [t0*5] sub t2d, t0d sub t2d, t1d ; qp % 6 shl t2d, 6 ; 16 * sizeof(int) %if ARCH_X86_64 imul t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf %else mov dctq, dctmp add t2, dmfmp imul t2d, [t2], -0xffff %endif %if HIGH_BIT_DEPTH mova m0, [dctq] mova m1, [dctq+16] SUMSUB_BA d, 1, 0, 2 ; 16-bit intermediate precision is enough for the first two sumsub steps, packssdw m1, m0 ; and by packing to words we can use pmaddwd instead of pmulld later. %else movq m0, [dctq] movq m1, [dctq+8] SUMSUB_BA w, 1, 0, 2 punpcklqdq m1, m0 ; a0 a1 a2 a3 a4 a5 a6 a7 %endif pshufd m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5 movd m3, t2d pshuflw m3, m3, q1000 ; + + + - SUMSUB_BA w, 0, 1, 2 punpcklqdq m3, m3 ; + + + - + + + - pshufd m1, m1, q0022 sub t0d, 6 jl .rshift movd m2, t0d psllw m3, m2 pmaddwd m0, m3 pmaddwd m1, m3 jmp .end .rshift: neg t0d movd m2, t0d pcmpeqd m4, m4 pmaddwd m0, m3 pmaddwd m1, m3 pslld m4, m2 psrad m4, 1 psubd m0, m4 ; + 1 << (qp/6-1) psubd m1, m4 psrad m0, m2 psrad m1, m2 .end: %ifidn %1, dconly %if HIGH_BIT_DEPTH mova [dctq], m0 mova [dctq+16], m1 %else packssdw m0, m1 mova [dctq], m0 %endif %else movifnidn dct4x4q, dct4x4mp %if HIGH_BIT_DEPTH movd [dct4x4q+0*64], m0 %if cpuflag(sse4) pextrd [dct4x4q+1*64], m0, 1 add dct4x4q, 4*64 pextrd [dct4x4q-2*64], m0, 2 pextrd [dct4x4q-1*64], m0, 3 movd [dct4x4q+0*64], m1 pextrd [dct4x4q+1*64], m1, 1 pextrd [dct4x4q+2*64], m1, 2 pextrd [dct4x4q+3*64], m1, 3 %else MOVHL m2, m0 psrlq m0, 32 movd [dct4x4q+1*64], m0 add dct4x4q, 4*64 movd [dct4x4q-2*64], m2 psrlq m2, 32 movd [dct4x4q-1*64], m2 movd [dct4x4q+0*64], m1 MOVHL m2, m1 psrlq m1, 32 movd [dct4x4q+1*64], m1 movd [dct4x4q+2*64], m2 psrlq m2, 32 movd [dct4x4q+3*64], m2 %endif %else PEXTRW [dct4x4q+0*32], m0, 0, eax PEXTRW [dct4x4q+1*32], m0, 2, eax PEXTRW [dct4x4q+2*32], m0, 4, eax PEXTRW [dct4x4q+3*32], m0, 6, eax add dct4x4q, 4*32 PEXTRW [dct4x4q+0*32], m1, 0, eax PEXTRW [dct4x4q+1*32], m1, 2, eax PEXTRW [dct4x4q+2*32], m1, 4, eax PEXTRW [dct4x4q+3*32], m1, 6, eax %endif %endif RET %endmacro ; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx INIT_XMM sse2 DEQUANT_2x4_DC dc DEQUANT_2x4_DC dconly INIT_XMM avx DEQUANT_2x4_DC dc DEQUANT_2x4_DC dconly ; t4 is eax for return value. 
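; Note on the idct_dequant_2x4 code above: multiplying the dequant scale by
; -0xffff is a one-instruction way of building the packed word pair
; (-dmf)<<16 | dmf, because dmf * -0xffff == dmf - (dmf<<16).  Worked
; example (value is illustrative): dmf = 100 -> 100 * -65535 = 0xff9c0064
; as a 32-bit word, i.e. high word 0xff9c (-100) and low word 0x0064 (+100),
; which is exactly the +/- pair the pmaddwd butterflies expect.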
%if ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX %else DECLARE_REG_TMP 4,1,2,3,0,5 %endif ;----------------------------------------------------------------------------- ; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) ;----------------------------------------------------------------------------- %macro OPTIMIZE_CHROMA_2x2_DC 0 cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7 movifnidn t0, r0mp movd m2, r1m movq m1, [t0] %if cpuflag(sse4) pcmpeqb m4, m4 pslld m4, 11 %else pxor m4, m4 %endif %if cpuflag(ssse3) mova m3, [chroma_dc_dct_mask] mova m5, [chroma_dc_dmf_mask] %else mova m3, [chroma_dc_dct_mask_mmx] mova m5, [chroma_dc_dmf_mask_mmx] %endif pshuflw m2, m2, 0 pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2 punpcklqdq m2, m2 punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0 mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2 PSIGNW m2, m5 ; + - - + - - + + paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2 pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf punpcklwd m1, m1 psrad m2, 16 ; + - - + mov t1d, 3 paddd m0, m6 xor t4d, t4d %if notcpuflag(ssse3) psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly %endif %if cpuflag(sse4) ptest m0, m4 %else mova m6, m0 SWAP 0, 6 psrad m6, 11 pcmpeqd m6, m4 pmovmskb t5d, m6 cmp t5d, 0xffff %endif jz .ret ; if the DC coefficients already round to zero, terminate early mova m3, m0 .outer_loop: movsx t3d, word [t0+2*t1] ; dct[coeff] pshufd m6, m1, q3333 pshufd m1, m1, q2100 ; move the next element to high dword PSIGND m5, m2, m6 test t3d, t3d jz .loop_end .outer_loop_0: mov t2d, t3d sar t3d, 31 or t3d, 1 .inner_loop: psubd m3, m5 ; coeff -= sign pxor m6, m0, m3 %if cpuflag(sse4) ptest m6, m4 %else psrad m6, 11 pcmpeqd m6, m4 pmovmskb t5d, m6 cmp t5d, 0xffff %endif jz .round_coeff paddd m3, m5 ; coeff += sign mov t4d, 1 .loop_end: dec t1d jz .last_coeff pshufd m2, m2, q1320 ; - + - + / - - + + jg .outer_loop .ret: REP_RET .round_coeff: sub t2d, t3d mov [t0+2*t1], t2w jnz .inner_loop jmp .loop_end .last_coeff: movsx t3d, word [t0] punpcklqdq m2, m2 ; + + + + PSIGND m5, m2, m1 test t3d, t3d jnz .outer_loop_0 RET %endmacro %if HIGH_BIT_DEPTH == 0 INIT_XMM sse2 OPTIMIZE_CHROMA_2x2_DC INIT_XMM ssse3 OPTIMIZE_CHROMA_2x2_DC INIT_XMM sse4 OPTIMIZE_CHROMA_2x2_DC INIT_XMM avx OPTIMIZE_CHROMA_2x2_DC %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size ) ;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 cglobal denoise_dct, 4,4,6 pxor m5, m5 movsxdifnidn r3, r3d .loop: mova m2, [r0+r3*4-2*mmsize] mova m3, [r0+r3*4-1*mmsize] ABSD m0, m2 ABSD m1, m3 paddd m4, m0, [r1+r3*4-2*mmsize] psubd m0, [r2+r3*4-2*mmsize] mova [r1+r3*4-2*mmsize], m4 paddd m4, m1, [r1+r3*4-1*mmsize] psubd m1, [r2+r3*4-1*mmsize] mova [r1+r3*4-1*mmsize], m4 pcmpgtd m4, m0, m5 pand m0, m4 pcmpgtd m4, m1, m5 pand m1, m4 PSIGND m0, m2 PSIGND m1, m3 mova [r0+r3*4-2*mmsize], m0 mova [r0+r3*4-1*mmsize], m1 sub r3d, mmsize/2 jg .loop RET %endmacro %if ARCH_X86_64 == 0 INIT_MMX mmx DENOISE_DCT %endif INIT_XMM sse2 DENOISE_DCT INIT_XMM ssse3 DENOISE_DCT INIT_XMM avx DENOISE_DCT INIT_YMM avx2 DENOISE_DCT %else ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) 
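; Both denoise_dct variants implement the same scalar idea; a hedged C
; sketch (simplified, not the exact reference implementation):
;
;   for( int i = 0; i < size; i++ )
;   {
;       int v = abs( dct[i] );
;       sum[i] += v;                      /* accumulate magnitude stats  */
;       v -= offset[i];                   /* subtract the denoise offset */
;       dct[i] = v <= 0 ? 0 : ( dct[i] < 0 ? -v : v );  /* keep the sign */
;   }
;
; The SIMD code below does the same with ABSW/ABSD for abs(), a saturating
; subtract or compare-and-mask for the clamp at zero, and PSIGN* to restore
; the original signs.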
;----------------------------------------------------------------------------- %macro DENOISE_DCT 0 cglobal denoise_dct, 4,4,7 pxor m6, m6 movsxdifnidn r3, r3d .loop: mova m2, [r0+r3*2-2*mmsize] mova m3, [r0+r3*2-1*mmsize] ABSW m0, m2, sign ABSW m1, m3, sign psubusw m4, m0, [r2+r3*2-2*mmsize] psubusw m5, m1, [r2+r3*2-1*mmsize] PSIGNW m4, m2 PSIGNW m5, m3 mova [r0+r3*2-2*mmsize], m4 mova [r0+r3*2-1*mmsize], m5 punpcklwd m2, m0, m6 punpcklwd m3, m1, m6 punpckhwd m0, m6 punpckhwd m1, m6 paddd m2, [r1+r3*4-4*mmsize] paddd m0, [r1+r3*4-3*mmsize] paddd m3, [r1+r3*4-2*mmsize] paddd m1, [r1+r3*4-1*mmsize] mova [r1+r3*4-4*mmsize], m2 mova [r1+r3*4-3*mmsize], m0 mova [r1+r3*4-2*mmsize], m3 mova [r1+r3*4-1*mmsize], m1 sub r3, mmsize jg .loop RET %endmacro %if ARCH_X86_64 == 0 INIT_MMX mmx DENOISE_DCT %endif INIT_XMM sse2 DENOISE_DCT INIT_XMM ssse3 DENOISE_DCT INIT_XMM avx DENOISE_DCT INIT_YMM avx2 cglobal denoise_dct, 4,4,4 pxor m3, m3 movsxdifnidn r3, r3d .loop: mova m1, [r0+r3*2-mmsize] pabsw m0, m1 psubusw m2, m0, [r2+r3*2-mmsize] vpermq m0, m0, q3120 psignw m2, m1 mova [r0+r3*2-mmsize], m2 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m1, [r1+r3*4-2*mmsize] paddd m0, [r1+r3*4-1*mmsize] mova [r1+r3*4-2*mmsize], m1 mova [r1+r3*4-1*mmsize], m0 sub r3, mmsize/2 jg .loop RET %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int decimate_score( dctcoef *dct ) ;----------------------------------------------------------------------------- %macro DECIMATE_MASK 4 %if HIGH_BIT_DEPTH mova m0, [%3+0*16] packssdw m0, [%3+1*16] mova m1, [%3+2*16] packssdw m1, [%3+3*16] ABSW2 m0, m1, m0, m1, m3, m4 %else ABSW m0, [%3+ 0], m3 ABSW m1, [%3+16], m4 %endif packsswb m0, m1 pxor m2, m2 pcmpeqb m2, m0 pcmpgtb m0, %4 pmovmskb %1, m2 pmovmskb %2, m0 %endmacro %macro DECIMATE_MASK16_AVX512 0 mova m0, [r0] %if HIGH_BIT_DEPTH vptestmd k0, m0, m0 pabsd m0, m0 vpcmpud k1, m0, [pd_1] {1to16}, 6 %else vptestmw k0, m0, m0 pabsw m0, m0 vpcmpuw k1, m0, [pw_1], 6 %endif %endmacro %macro SHRX 2 %if cpuflag(bmi2) shrx %1, %1, %2 %else shr %1, %2b ; %2 has to be rcx/ecx %endif %endmacro %macro BLSR 2 %if cpuflag(bmi1) blsr %1, %2 %else lea %1, [%2-1] and %1, %2 %endif %endmacro cextern_common decimate_table4 cextern_common decimate_table8 %macro DECIMATE4x4 1 cglobal decimate_score%1, 1,3 %if cpuflag(avx512) DECIMATE_MASK16_AVX512 xor eax, eax kmovw edx, k0 %if %1 == 15 shr edx, 1 %else test edx, edx %endif jz .ret ktestw k1, k1 jnz .ret9 %else DECIMATE_MASK edx, eax, r0, [pb_1] xor edx, 0xffff jz .ret test eax, eax jnz .ret9 %if %1 == 15 shr edx, 1 %endif %endif %if ARCH_X86_64 lea r4, [decimate_mask_table4] %define mask_table r4 %else %define mask_table decimate_mask_table4 %endif movzx ecx, dl movzx eax, byte [mask_table + rcx] %if ARCH_X86_64 xor edx, ecx jz .ret %if cpuflag(lzcnt) lzcnt ecx, ecx lea r5, [decimate_table4-32] add r5, rcx %else bsr ecx, ecx lea r5, [decimate_table4-1] sub r5, rcx %endif %define table r5 %else cmp edx, ecx jz .ret bsr ecx, ecx shr edx, 1 SHRX edx, ecx %define table decimate_table4 %endif tzcnt ecx, edx shr edx, 1 SHRX edx, ecx add al, byte [table + rcx] add al, byte [mask_table + rdx] .ret: REP_RET .ret9: mov eax, 9 RET %endmacro %macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high mova m0, [r0+0*32] packsswb m0, [r0+1*32] mova m1, [r0+2*32] packsswb m1, [r0+3*32] mova m4, [pb_1] pabsb m2, m0 pabsb m3, m1 por m2, m3 ; the > 1 checks don't care about order, so ptest m4, m2 ; we can save latency by doing them here jnc .ret9 vpermq m0, m0, q3120 vpermq 
m1, m1, q3120 pxor m4, m4 pcmpeqb m0, m4 pcmpeqb m1, m4 pmovmskb %1, m0 pmovmskb %2, m1 %endmacro %macro DECIMATE_MASK64_AVX512 0 mova m0, [r0] %if HIGH_BIT_DEPTH packssdw m0, [r0+1*64] mova m1, [r0+2*64] packssdw m1, [r0+3*64] packsswb m0, m1 vbroadcasti32x4 m1, [pb_1] pabsb m2, m0 vpcmpub k0, m2, m1, 6 ktestq k0, k0 jnz .ret9 mova m1, [decimate_shuf_avx512] vpermd m0, m1, m0 vptestmb k1, m0, m0 %else mova m1, [r0+64] vbroadcasti32x4 m3, [pb_1] packsswb m2, m0, m1 pabsb m2, m2 vpcmpub k0, m2, m3, 6 ktestq k0, k0 jnz .ret9 vptestmw k1, m0, m0 vptestmw k2, m1, m1 %endif %endmacro %macro DECIMATE8x8 0 %if ARCH_X86_64 cglobal decimate_score64, 1,5 %if mmsize == 64 DECIMATE_MASK64_AVX512 xor eax, eax %if HIGH_BIT_DEPTH kmovq r1, k1 test r1, r1 jz .ret %else kortestd k1, k2 jz .ret kunpckdq k1, k2, k1 kmovq r1, k1 %endif %elif mmsize == 32 DECIMATE_MASK64_AVX2 r1d, eax not r1 shl rax, 32 xor r1, rax jz .ret %else mova m5, [pb_1] DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5 test eax, eax jnz .ret9 DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5 shl r2d, 16 or r1d, r2d DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5 shl r2, 32 or eax, r3d or r1, r2 DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5 not r1 shl r2, 48 xor r1, r2 jz .ret add eax, r3d jnz .ret9 %endif lea r4, [decimate_table8] mov al, -6 .loop: tzcnt rcx, r1 add al, byte [r4 + rcx] jge .ret9 shr r1, 1 SHRX r1, rcx %if cpuflag(bmi2) test r1, r1 %endif jnz .loop add al, 6 .ret: REP_RET .ret9: mov eax, 9 RET %else ; ARCH cglobal decimate_score64, 1,4 %if mmsize == 64 DECIMATE_MASK64_AVX512 xor eax, eax %if HIGH_BIT_DEPTH kshiftrq k2, k1, 32 %endif kmovd r2, k1 kmovd r3, k2 test r2, r2 jz .tryret %elif mmsize == 32 DECIMATE_MASK64_AVX2 r2, r3 xor eax, eax not r3 xor r2, -1 jz .tryret %else mova m5, [pb_1] DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5 test r1, r1 jnz .ret9 DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5 not r2 shl r3, 16 xor r2, r3 mov r0m, r2 DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5 or r2, r1 DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5 add r0, r2 jnz .ret9 mov r2, r0m not r3 shl r1, 16 xor r3, r1 test r2, r2 jz .tryret %endif mov al, -6 .loop: tzcnt ecx, r2 add al, byte [decimate_table8 + ecx] jge .ret9 sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31 shrd r2, r3, cl SHRX r3, ecx %if notcpuflag(bmi2) test r2, r2 %endif jnz .loop BLSR r2, r3 jz .end .largerun: tzcnt ecx, r3 shr r3, 1 SHRX r3, ecx .loop2: tzcnt ecx, r3 add al, byte [decimate_table8 + ecx] jge .ret9 shr r3, 1 SHRX r3, ecx .run31: test r3, r3 jnz .loop2 .end: add al, 6 RET .tryret: BLSR r2, r3 jz .ret mov al, -6 jmp .largerun .ret9: mov eax, 9 .ret: REP_RET %endif ; ARCH %endmacro INIT_XMM sse2 DECIMATE4x4 15 DECIMATE4x4 16 DECIMATE8x8 INIT_XMM ssse3 DECIMATE4x4 15 DECIMATE4x4 16 DECIMATE8x8 %if HIGH_BIT_DEPTH INIT_ZMM avx512 %else INIT_YMM avx2 DECIMATE8x8 INIT_YMM avx512 %endif DECIMATE4x4 15 DECIMATE4x4 16 INIT_ZMM avx512 DECIMATE8x8 ;----------------------------------------------------------------------------- ; int coeff_last( dctcoef *dct ) ;----------------------------------------------------------------------------- %macro BSR 3 %if cpuflag(lzcnt) lzcnt %1, %2 xor %1, %3 %else bsr %1, %2 %endif %endmacro %macro LZCOUNT 3 %if cpuflag(lzcnt) lzcnt %1, %2 %else bsr %1, %2 xor %1, %3 %endif %endmacro %if HIGH_BIT_DEPTH %macro LAST_MASK 3-4 %if %1 == 4 movq mm0, [%3] packssdw mm0, [%3+8] packsswb mm0, mm0 pcmpeqb mm0, mm2 
pmovmskb %2, mm0 %elif mmsize == 16 movdqa xmm0, [%3+ 0] %if %1 == 8 packssdw xmm0, [%3+16] packsswb xmm0, xmm0 %else movdqa xmm1, [%3+32] packssdw xmm0, [%3+16] packssdw xmm1, [%3+48] packsswb xmm0, xmm1 %endif pcmpeqb xmm0, xmm2 pmovmskb %2, xmm0 %elif %1 == 8 movq mm0, [%3+ 0] movq mm1, [%3+16] packssdw mm0, [%3+ 8] packssdw mm1, [%3+24] packsswb mm0, mm1 pcmpeqb mm0, mm2 pmovmskb %2, mm0 %else movq mm0, [%3+ 0] movq mm1, [%3+16] packssdw mm0, [%3+ 8] packssdw mm1, [%3+24] movq mm3, [%3+32] movq mm4, [%3+48] packssdw mm3, [%3+40] packssdw mm4, [%3+56] packsswb mm0, mm1 packsswb mm3, mm4 pcmpeqb mm0, mm2 pcmpeqb mm3, mm2 pmovmskb %2, mm0 pmovmskb %4, mm3 shl %4, 8 or %2, %4 %endif %endmacro %macro COEFF_LAST4 0 cglobal coeff_last4, 1,3 pxor mm2, mm2 LAST_MASK 4, r1d, r0 xor r1d, 0xff shr r1d, 4 BSR eax, r1d, 0x1f RET %endmacro INIT_MMX mmx2 COEFF_LAST4 INIT_MMX lzcnt COEFF_LAST4 %macro COEFF_LAST8 0 cglobal coeff_last8, 1,3 pxor m2, m2 LAST_MASK 8, r1d, r0 %if mmsize == 16 xor r1d, 0xffff shr r1d, 8 %else xor r1d, 0xff %endif BSR eax, r1d, 0x1f RET %endmacro %if ARCH_X86_64 == 0 INIT_MMX mmx2 COEFF_LAST8 %endif INIT_XMM sse2 COEFF_LAST8 INIT_XMM lzcnt COEFF_LAST8 %else ; !HIGH_BIT_DEPTH %macro LAST_MASK 3-4 %if %1 <= 8 movq mm0, [%3+ 0] %if %1 == 4 packsswb mm0, mm0 %else packsswb mm0, [%3+ 8] %endif pcmpeqb mm0, mm2 pmovmskb %2, mm0 %elif mmsize == 16 movdqa xmm0, [%3+ 0] packsswb xmm0, [%3+16] pcmpeqb xmm0, xmm2 pmovmskb %2, xmm0 %else movq mm0, [%3+ 0] movq mm1, [%3+16] packsswb mm0, [%3+ 8] packsswb mm1, [%3+24] pcmpeqb mm0, mm2 pcmpeqb mm1, mm2 pmovmskb %2, mm0 pmovmskb %4, mm1 shl %4, 8 or %2, %4 %endif %endmacro %macro COEFF_LAST48 0 %if ARCH_X86_64 cglobal coeff_last4, 1,1 BSR rax, [r0], 0x3f shr eax, 4 RET %else cglobal coeff_last4, 0,3 mov edx, r0mp mov eax, [edx+4] xor ecx, ecx test eax, eax cmovz eax, [edx] setnz cl BSR eax, eax, 0x1f shr eax, 4 lea eax, [eax+ecx*2] RET %endif cglobal coeff_last8, 1,3 pxor m2, m2 LAST_MASK 8, r1d, r0, r2d xor r1d, 0xff BSR eax, r1d, 0x1f RET %endmacro INIT_MMX mmx2 COEFF_LAST48 INIT_MMX lzcnt COEFF_LAST48 %endif ; HIGH_BIT_DEPTH %macro COEFF_LAST 0 cglobal coeff_last15, 1,3 pxor m2, m2 LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d xor r1d, 0xffff BSR eax, r1d, 0x1f dec eax RET cglobal coeff_last16, 1,3 pxor m2, m2 LAST_MASK 16, r1d, r0, r2d xor r1d, 0xffff BSR eax, r1d, 0x1f RET %if ARCH_X86_64 == 0 cglobal coeff_last64, 1, 4-mmsize/16 pxor m2, m2 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d shl r2d, 16 or r1d, r2d xor r1d, -1 jne .secondhalf LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d shl r2d, 16 or r1d, r2d not r1d BSR eax, r1d, 0x1f RET .secondhalf: BSR eax, r1d, 0x1f add eax, 32 RET %else cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0 LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16 shl r2d, 16 or r1d, r2d LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32 LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48 shl r0d, 16 or r2d, r0d shl r2, 32 or r1, r2 not r1 BSR rax, r1, 0x3f RET %endif %endmacro %if ARCH_X86_64 == 0 INIT_MMX mmx2 COEFF_LAST %endif INIT_XMM sse2 COEFF_LAST INIT_XMM lzcnt COEFF_LAST %macro LAST_MASK_AVX2 2 %if HIGH_BIT_DEPTH mova m0, [%2+ 0] packssdw m0, [%2+32] mova m1, [%2+64] packssdw m1, [%2+96] packsswb m0, m1 mova m1, [deinterleave_shufd] vpermd m0, m1, m0 %else mova m0, [%2+ 0] packsswb m0, [%2+32] vpermq m0, m0, q3120 %endif pcmpeqb m0, m2 pmovmskb %1, m0 %endmacro %if ARCH_X86_64 == 0 INIT_YMM avx2 cglobal coeff_last64, 1,2 
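; Reminder: the coeff_last family returns the index of the last nonzero
; coefficient in scan order; the scalar equivalent is roughly the loop
; below (illustrative helper, not the exact reference code):
;
;   int coeff_last( dctcoef *l, int n )
;   {
;       int i = n - 1;
;       while( i >= 0 && !l[i] )
;           i--;
;       return i;
;   }
;
; The SIMD versions replace the loop with a zero/nonzero bitmask
; (pcmpeqb+pmovmskb or vptestm*), invert it, and locate the highest set bit
; with bsr/lzcnt.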
pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32 xor r1d, -1 jne .secondhalf LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 not r1d BSR eax, r1d, 0x1f RET .secondhalf: BSR eax, r1d, 0x1f add eax, 32 RET %else INIT_YMM avx2 cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32 shl r2, 32 or r1, r2 not r1 BSR rax, r1, 0x3f RET %endif %macro COEFF_LAST_AVX512 2 ; num, w/d cglobal coeff_last%1, 1,2 mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF] vptestm%2 k0, m0, m0 %if %1 == 15 mov eax, 30 kmovw r1d, k0 lzcnt r1d, r1d sub eax, r1d %else kmovw eax, k0 lzcnt eax, eax xor eax, 31 %endif RET %endmacro %macro COEFF_LAST64_AVX512 1 ; w/d cglobal coeff_last64, 1,2 pxor xm0, xm0 vpcmp%1 k0, m0, [r0+0*64], 4 vpcmp%1 k1, m0, [r0+1*64], 4 %if HIGH_BIT_DEPTH vpcmp%1 k2, m0, [r0+2*64], 4 vpcmp%1 k3, m0, [r0+3*64], 4 kunpckwd k0, k1, k0 kunpckwd k1, k3, k2 %endif %if ARCH_X86_64 kunpckdq k0, k1, k0 kmovq rax, k0 lzcnt rax, rax xor eax, 63 %else kmovd r1d, k1 kmovd eax, k0 lzcnt r1d, r1d lzcnt eax, eax xor r1d, 32 cmovnz eax, r1d xor eax, 31 %endif RET %endmacro %if HIGH_BIT_DEPTH INIT_XMM avx512 COEFF_LAST_AVX512 4, d INIT_YMM avx512 COEFF_LAST_AVX512 8, d INIT_ZMM avx512 COEFF_LAST_AVX512 15, d COEFF_LAST_AVX512 16, d COEFF_LAST64_AVX512 d %else ; !HIGH_BIT_DEPTH INIT_XMM avx512 COEFF_LAST_AVX512 8, w INIT_YMM avx512 COEFF_LAST_AVX512 15, w COEFF_LAST_AVX512 16, w INIT_ZMM avx512 COEFF_LAST64_AVX512 w %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel ) ;----------------------------------------------------------------------------- struc levelrun .last: resd 1 .mask: resd 1 align 16, resb 1 .level: resw 16 endstruc ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args %if WIN64 DECLARE_REG_TMP 3,1,2,0,4,5,6 %elif ARCH_X86_64 DECLARE_REG_TMP 0,1,2,3,4,5,6 %else DECLARE_REG_TMP 6,3,2,1,4,5,0 %endif %macro COEFF_LEVELRUN 1 cglobal coeff_level_run%1,0,7 movifnidn t0, r0mp movifnidn t1, r1mp pxor m2, m2 xor t3d, t3d LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d %if %1==15 shr t5d, 1 %elif %1==8 and t5d, 0xff %elif %1==4 and t5d, 0xf %endif xor t5d, (1<<%1)-1 mov [t1+levelrun.mask], t5d shl t5d, 32-%1 mov t4d, %1-1 LZCOUNT t3d, t5d, 0x1f xor t6d, t6d add t5d, t5d sub t4d, t3d shl t5d, t3b mov [t1+levelrun.last], t4d .loop: LZCOUNT t3d, t5d, 0x1f %if HIGH_BIT_DEPTH mov t2d, [t0+t4*4] %else mov t2w, [t0+t4*2] %endif inc t3d shl t5d, t3b %if HIGH_BIT_DEPTH mov [t1+t6*4+levelrun.level], t2d %else mov [t1+t6*2+levelrun.level], t2w %endif inc t6d sub t4d, t3d jge .loop RET %endmacro INIT_MMX mmx2 %if ARCH_X86_64 == 0 COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 %endif COEFF_LEVELRUN 4 COEFF_LEVELRUN 8 INIT_XMM sse2 %if HIGH_BIT_DEPTH COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 INIT_MMX lzcnt COEFF_LEVELRUN 4 %if HIGH_BIT_DEPTH == 0 COEFF_LEVELRUN 8 %endif INIT_XMM lzcnt %if HIGH_BIT_DEPTH COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 ; Similar to the one above, but saves the DCT ; coefficients in m0/m1 so we don't have to load ; them later. 
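; For reference, the coeff_level_run family below gathers the nonzero
; levels in reverse scan order; a hedged C sketch of the scalar behaviour
; (field names follow the levelrun struc above, coeff_last is the
; illustrative helper sketched earlier):
;
;   int total = 0, mask = 0;
;   int i = coeff_last( dct, n );          /* index of last nonzero      */
;   runlevel->last = i;
;   do {
;       runlevel->level[total++] = dct[i]; /* store the level            */
;       mask |= 1 << i;                    /* remember its position      */
;       while( --i >= 0 && !dct[i] );      /* skip the run of zeros      */
;   } while( i >= 0 );
;   runlevel->mask = mask;
;   return total;                          /* number of nonzero levels   */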
%macro LAST_MASK_LUT 3 pxor xm5, xm5 %if %1 <= 8 mova m0, [%3] packsswb m2, m0, m0 %else mova xm0, [%3+ 0] mova xm1, [%3+16] packsswb xm2, xm0, xm1 %if mmsize==32 vinserti128 m0, m0, xm1, 1 %endif %endif pcmpeqb xm2, xm5 pmovmskb %2, xm2 %endmacro %macro COEFF_LEVELRUN_LUT 1 cglobal coeff_level_run%1,2,4+(%1/9) %if ARCH_X86_64 lea r5, [$$] %define GLOBAL +r5-$$ %else %define GLOBAL %endif LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF %if %1==15 shr eax, 1 %elif %1==8 and eax, 0xff %elif %1==4 and eax, 0xf %endif xor eax, (1<<%1)-1 mov [r1+levelrun.mask], eax %if %1==15 add eax, eax %endif %if %1 > 8 %if ARCH_X86_64 mov r4d, eax shr r4d, 8 %else movzx r4d, ah ; first 8 bits %endif %endif movzx r2d, al ; second 8 bits shl eax, 32-%1-(%1&1) LZCOUNT eax, eax, 0x1f mov r3d, %1-1 sub r3d, eax mov [r1+levelrun.last], r3d ; Here we abuse pshufb, combined with a lookup table, to do a gather ; operation based on a bitmask. For example: ; ; dct 15-8 (input): 0 0 4 0 0 -2 1 0 ; dct 7-0 (input): 0 0 -1 0 0 0 0 15 ; bitmask 1: 0 0 1 0 0 1 1 0 ; bitmask 2: 0 0 1 0 0 0 0 1 ; gather 15-8: 4 -2 1 __ __ __ __ __ ; gather 7-0: -1 15 __ __ __ __ __ __ ; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __ ; ; The overlapping, dependent stores almost surely cause a mess of ; forwarding issues, but it's still enormously faster. %if %1 > 8 movzx eax, byte [popcnt_table+r4 GLOBAL] movzx r3d, byte [popcnt_table+r2 GLOBAL] %if mmsize==16 movh m3, [dct_coef_shuffle+r4*8 GLOBAL] movh m2, [dct_coef_shuffle+r2*8 GLOBAL] mova m4, [pw_256] ; Storing 8 bytes of shuffle constant and converting it (unpack + or) ; is neutral to slightly faster in local speed measurements, but it ; cuts the table size in half, which is surely a big cache win. punpcklbw m3, m3 punpcklbw m2, m2 por m3, m4 por m2, m4 pshufb m1, m3 pshufb m0, m2 mova [r1+levelrun.level], m1 ; This obnoxious unaligned store messes with store forwarding and ; stalls the CPU to no end, but merging the two registers before ; storing requires a variable 128-bit shift. Emulating this does ; work, but requires a lot of ops and the gain is tiny and ; inconsistent, so we'll err on the side of fewer instructions. 
movu [r1+rax*2+levelrun.level], m0 %else ; mmsize==32 movq xm2, [dct_coef_shuffle+r4*8 GLOBAL] vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1 punpcklbw m2, m2 por m2, [pw_256] pshufb m0, m2 vextracti128 [r1+levelrun.level], m0, 1 movu [r1+rax*2+levelrun.level], xm0 %endif add eax, r3d %else movzx eax, byte [popcnt_table+r2 GLOBAL] movh m1, [dct_coef_shuffle+r2*8 GLOBAL] punpcklbw m1, m1 por m1, [pw_256] pshufb m0, m1 mova [r1+levelrun.level], m0 %endif RET %endmacro %if HIGH_BIT_DEPTH==0 INIT_MMX ssse3 COEFF_LEVELRUN_LUT 4 INIT_XMM ssse3 COEFF_LEVELRUN_LUT 8 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 INIT_MMX ssse3, lzcnt COEFF_LEVELRUN_LUT 4 INIT_XMM ssse3, lzcnt COEFF_LEVELRUN_LUT 8 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 INIT_XMM avx2 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 %endif x264-master/common/x86/quant.h000066400000000000000000000445311502133446700163040ustar00rootroot00000000000000/***************************************************************************** * quant.h: x86 quantization and level-run ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser * Christian Heine * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_X86_QUANT_H #define X264_X86_QUANT_H #define x264_quant_2x2_dc_mmx2 x264_template(quant_2x2_dc_mmx2) int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias ); #define x264_quant_4x4_dc_mmx2 x264_template(quant_4x4_dc_mmx2) int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_mmx2 x264_template(quant_4x4_mmx2) int x264_quant_4x4_mmx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_8x8_mmx2 x264_template(quant_8x8_mmx2) int x264_quant_8x8_mmx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_quant_2x2_dc_sse2 x264_template(quant_2x2_dc_sse2) int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_dc_sse2 x264_template(quant_4x4_dc_sse2) int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_sse2 x264_template(quant_4x4_sse2) int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_4x4x4_sse2 x264_template(quant_4x4x4_sse2) int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_8x8_sse2 x264_template(quant_8x8_sse2) int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_quant_2x2_dc_ssse3 x264_template(quant_2x2_dc_ssse3) int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias ); #define x264_quant_4x4_dc_ssse3 x264_template(quant_4x4_dc_ssse3) int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_ssse3 x264_template(quant_4x4_ssse3) int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_4x4x4_ssse3 x264_template(quant_4x4x4_ssse3) int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_8x8_ssse3 x264_template(quant_8x8_ssse3) int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_quant_2x2_dc_sse4 x264_template(quant_2x2_dc_sse4) int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_dc_sse4 x264_template(quant_4x4_dc_sse4) int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_sse4 x264_template(quant_4x4_sse4) int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_4x4x4_sse4 x264_template(quant_4x4x4_sse4) int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_8x8_sse4 x264_template(quant_8x8_sse4) int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_quant_4x4_avx2 x264_template(quant_4x4_avx2) int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_4x4_dc_avx2 x264_template(quant_4x4_dc_avx2) int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias ); #define x264_quant_8x8_avx2 x264_template(quant_8x8_avx2) int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_quant_4x4x4_avx2 x264_template(quant_4x4x4_avx2) int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); #define x264_dequant_4x4_mmx x264_template(dequant_4x4_mmx) void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4dc_mmx2 x264_template(dequant_4x4dc_mmx2) void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp 
); #define x264_dequant_8x8_mmx x264_template(dequant_8x8_mmx) void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_sse2 x264_template(dequant_4x4_sse2) void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4dc_sse2 x264_template(dequant_4x4dc_sse2) void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_sse2 x264_template(dequant_8x8_sse2) void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_avx x264_template(dequant_4x4_avx) void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4dc_avx x264_template(dequant_4x4dc_avx) void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_avx x264_template(dequant_8x8_avx) void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_xop x264_template(dequant_4x4_xop) void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4dc_xop x264_template(dequant_4x4dc_xop) void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_xop x264_template(dequant_8x8_xop) void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_avx2 x264_template(dequant_4x4_avx2) void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4dc_avx2 x264_template(dequant_4x4dc_avx2) void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_avx2 x264_template(dequant_8x8_avx2) void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_avx512 x264_template(dequant_4x4_avx512) void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_avx512 x264_template(dequant_8x8_avx512) void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_flat16_mmx x264_template(dequant_4x4_flat16_mmx) void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_flat16_mmx x264_template(dequant_8x8_flat16_mmx) void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_flat16_sse2 x264_template(dequant_4x4_flat16_sse2) void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_flat16_sse2 x264_template(dequant_8x8_flat16_sse2) void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_4x4_flat16_avx2 x264_template(dequant_4x4_flat16_avx2) void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_flat16_avx2 x264_template(dequant_8x8_flat16_avx2) void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #define x264_dequant_8x8_flat16_avx512 x264_template(dequant_8x8_flat16_avx512) void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp ); #define x264_idct_dequant_2x4_dc_sse2 x264_template(idct_dequant_2x4_dc_sse2) void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); #define x264_idct_dequant_2x4_dc_avx x264_template(idct_dequant_2x4_dc_avx) 
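/* Every asm symbol in this header is declared through the same two-line
 * pattern: a #define that routes the public name through x264_template(),
 * followed by the prototype.  x264_template() mangles the name with the
 * configured bit depth so the 8-bit and 10-bit builds of these functions
 * can coexist in one library -- roughly (a sketch; the real macro lives in
 * the common headers):
 *
 *   #define x264_template( w ) x264_glue3( x264, BIT_DEPTH, w )
 *   // e.g. dequant_8x8_sse2 -> x264_8_dequant_8x8_sse2 / x264_10_dequant_8x8_sse2
 */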
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); #define x264_idct_dequant_2x4_dconly_sse2 x264_template(idct_dequant_2x4_dconly_sse2) void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); #define x264_idct_dequant_2x4_dconly_avx x264_template(idct_dequant_2x4_dconly_avx) void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); #define x264_optimize_chroma_2x2_dc_sse2 x264_template(optimize_chroma_2x2_dc_sse2) int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); #define x264_optimize_chroma_2x2_dc_ssse3 x264_template(optimize_chroma_2x2_dc_ssse3) int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); #define x264_optimize_chroma_2x2_dc_sse4 x264_template(optimize_chroma_2x2_dc_sse4) int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); #define x264_optimize_chroma_2x2_dc_avx x264_template(optimize_chroma_2x2_dc_avx) int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf ); #define x264_denoise_dct_mmx x264_template(denoise_dct_mmx) void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); #define x264_denoise_dct_sse2 x264_template(denoise_dct_sse2) void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); #define x264_denoise_dct_ssse3 x264_template(denoise_dct_ssse3) void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); #define x264_denoise_dct_avx x264_template(denoise_dct_avx) void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); #define x264_denoise_dct_avx2 x264_template(denoise_dct_avx2) void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); #define x264_decimate_score15_sse2 x264_template(decimate_score15_sse2) int x264_decimate_score15_sse2( dctcoef *dct ); #define x264_decimate_score15_ssse3 x264_template(decimate_score15_ssse3) int x264_decimate_score15_ssse3( dctcoef *dct ); #define x264_decimate_score15_avx512 x264_template(decimate_score15_avx512) int x264_decimate_score15_avx512( dctcoef *dct ); #define x264_decimate_score16_sse2 x264_template(decimate_score16_sse2) int x264_decimate_score16_sse2( dctcoef *dct ); #define x264_decimate_score16_ssse3 x264_template(decimate_score16_ssse3) int x264_decimate_score16_ssse3( dctcoef *dct ); #define x264_decimate_score16_avx512 x264_template(decimate_score16_avx512) int x264_decimate_score16_avx512( dctcoef *dct ); #define x264_decimate_score64_sse2 x264_template(decimate_score64_sse2) int x264_decimate_score64_sse2( dctcoef *dct ); #define x264_decimate_score64_ssse3 x264_template(decimate_score64_ssse3) int x264_decimate_score64_ssse3( dctcoef *dct ); #define x264_decimate_score64_avx2 x264_template(decimate_score64_avx2) int x264_decimate_score64_avx2( int16_t *dct ); #define x264_decimate_score64_avx512 x264_template(decimate_score64_avx512) int x264_decimate_score64_avx512( dctcoef *dct ); #define x264_coeff_last4_mmx2 x264_template(coeff_last4_mmx2) int x264_coeff_last4_mmx2( dctcoef *dct ); #define x264_coeff_last8_mmx2 x264_template(coeff_last8_mmx2) int x264_coeff_last8_mmx2( dctcoef *dct ); #define x264_coeff_last15_mmx2 x264_template(coeff_last15_mmx2) int x264_coeff_last15_mmx2( dctcoef *dct ); #define x264_coeff_last16_mmx2 x264_template(coeff_last16_mmx2) int x264_coeff_last16_mmx2( dctcoef *dct ); #define x264_coeff_last64_mmx2 x264_template(coeff_last64_mmx2) int 
x264_coeff_last64_mmx2( dctcoef *dct ); #define x264_coeff_last8_sse2 x264_template(coeff_last8_sse2) int x264_coeff_last8_sse2( dctcoef *dct ); #define x264_coeff_last15_sse2 x264_template(coeff_last15_sse2) int x264_coeff_last15_sse2( dctcoef *dct ); #define x264_coeff_last16_sse2 x264_template(coeff_last16_sse2) int x264_coeff_last16_sse2( dctcoef *dct ); #define x264_coeff_last64_sse2 x264_template(coeff_last64_sse2) int x264_coeff_last64_sse2( dctcoef *dct ); #define x264_coeff_last4_lzcnt x264_template(coeff_last4_lzcnt) int x264_coeff_last4_lzcnt( dctcoef *dct ); #define x264_coeff_last8_lzcnt x264_template(coeff_last8_lzcnt) int x264_coeff_last8_lzcnt( dctcoef *dct ); #define x264_coeff_last15_lzcnt x264_template(coeff_last15_lzcnt) int x264_coeff_last15_lzcnt( dctcoef *dct ); #define x264_coeff_last16_lzcnt x264_template(coeff_last16_lzcnt) int x264_coeff_last16_lzcnt( dctcoef *dct ); #define x264_coeff_last64_lzcnt x264_template(coeff_last64_lzcnt) int x264_coeff_last64_lzcnt( dctcoef *dct ); #define x264_coeff_last64_avx2 x264_template(coeff_last64_avx2) int x264_coeff_last64_avx2 ( dctcoef *dct ); #define x264_coeff_last4_avx512 x264_template(coeff_last4_avx512) int x264_coeff_last4_avx512( int32_t *dct ); #define x264_coeff_last8_avx512 x264_template(coeff_last8_avx512) int x264_coeff_last8_avx512( dctcoef *dct ); #define x264_coeff_last15_avx512 x264_template(coeff_last15_avx512) int x264_coeff_last15_avx512( dctcoef *dct ); #define x264_coeff_last16_avx512 x264_template(coeff_last16_avx512) int x264_coeff_last16_avx512( dctcoef *dct ); #define x264_coeff_last64_avx512 x264_template(coeff_last64_avx512) int x264_coeff_last64_avx512( dctcoef *dct ); #define x264_coeff_level_run16_mmx2 x264_template(coeff_level_run16_mmx2) int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run16_sse2 x264_template(coeff_level_run16_sse2) int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run16_lzcnt x264_template(coeff_level_run16_lzcnt) int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run16_ssse3 x264_template(coeff_level_run16_ssse3) int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run16_ssse3_lzcnt x264_template(coeff_level_run16_ssse3_lzcnt) int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run16_avx2 x264_template(coeff_level_run16_avx2) int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run15_mmx2 x264_template(coeff_level_run15_mmx2) int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run15_sse2 x264_template(coeff_level_run15_sse2) int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run15_lzcnt x264_template(coeff_level_run15_lzcnt) int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run15_ssse3 x264_template(coeff_level_run15_ssse3) int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run15_ssse3_lzcnt x264_template(coeff_level_run15_ssse3_lzcnt) int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run15_avx2 x264_template(coeff_level_run15_avx2) int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel 
); #define x264_coeff_level_run4_mmx2 x264_template(coeff_level_run4_mmx2) int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run4_lzcnt x264_template(coeff_level_run4_lzcnt) int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run4_ssse3 x264_template(coeff_level_run4_ssse3) int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run4_ssse3_lzcnt x264_template(coeff_level_run4_ssse3_lzcnt) int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run8_mmx2 x264_template(coeff_level_run8_mmx2) int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run8_lzcnt x264_template(coeff_level_run8_lzcnt) int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run8_sse2 x264_template(coeff_level_run8_sse2) int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run8_ssse3 x264_template(coeff_level_run8_ssse3) int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_coeff_level_run8_ssse3_lzcnt x264_template(coeff_level_run8_ssse3_lzcnt) int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); #define x264_trellis_cabac_4x4_sse2 x264_template(trellis_cabac_4x4_sse2) int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); #define x264_trellis_cabac_4x4_ssse3 x264_template(trellis_cabac_4x4_ssse3) int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac ); #define x264_trellis_cabac_8x8_sse2 x264_template(trellis_cabac_8x8_sse2) int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced ); #define x264_trellis_cabac_8x8_ssse3 x264_template(trellis_cabac_8x8_ssse3) int x264_trellis_cabac_8x8_ssse3( TRELLIS_PARAMS, int b_interlaced ); #define x264_trellis_cabac_4x4_psy_sse2 x264_template(trellis_cabac_4x4_psy_sse2) int x264_trellis_cabac_4x4_psy_sse2 ( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis ); #define x264_trellis_cabac_4x4_psy_ssse3 x264_template(trellis_cabac_4x4_psy_ssse3) int x264_trellis_cabac_4x4_psy_ssse3( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis ); #define x264_trellis_cabac_8x8_psy_sse2 x264_template(trellis_cabac_8x8_psy_sse2) int x264_trellis_cabac_8x8_psy_sse2 ( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis ); #define x264_trellis_cabac_8x8_psy_ssse3 x264_template(trellis_cabac_8x8_psy_ssse3) int x264_trellis_cabac_8x8_psy_ssse3( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis ); #define x264_trellis_cabac_dc_sse2 x264_template(trellis_cabac_dc_sse2) int x264_trellis_cabac_dc_sse2 ( TRELLIS_PARAMS, int i_coefs ); #define x264_trellis_cabac_dc_ssse3 x264_template(trellis_cabac_dc_ssse3) int x264_trellis_cabac_dc_ssse3( TRELLIS_PARAMS, int i_coefs ); #define x264_trellis_cabac_chroma_422_dc_sse2 x264_template(trellis_cabac_chroma_422_dc_sse2) int x264_trellis_cabac_chroma_422_dc_sse2 ( TRELLIS_PARAMS ); #define x264_trellis_cabac_chroma_422_dc_ssse3 x264_template(trellis_cabac_chroma_422_dc_ssse3) int x264_trellis_cabac_chroma_422_dc_ssse3( TRELLIS_PARAMS ); #endif x264-master/common/x86/sad-a.asm000066400000000000000000001511321502133446700164660ustar00rootroot00000000000000;***************************************************************************** ;* sad-a.asm: x86 sad functions 
;***************************************************************************** ;* Copyright (C) 2003-2025 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser ;* Laurent Aimar ;* Alex Izvorski ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. ;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA 32 pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1 hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11 SECTION .text cextern pb_3 cextern pb_shuf8x8c cextern pw_8 cextern sw_64 ;============================================================================= ; SAD MMX ;============================================================================= %macro SAD_INC_2x16P 0 movq mm1, [r0] movq mm2, [r0+8] movq mm3, [r0+r1] movq mm4, [r0+r1+8] psadbw mm1, [r2] psadbw mm2, [r2+8] psadbw mm3, [r2+r3] psadbw mm4, [r2+r3+8] lea r0, [r0+2*r1] paddw mm1, mm2 paddw mm3, mm4 lea r2, [r2+2*r3] paddw mm0, mm1 paddw mm0, mm3 %endmacro %macro SAD_INC_2x8P 0 movq mm1, [r0] movq mm2, [r0+r1] psadbw mm1, [r2] psadbw mm2, [r2+r3] lea r0, [r0+2*r1] paddw mm0, mm1 paddw mm0, mm2 lea r2, [r2+2*r3] %endmacro %macro SAD_INC_2x4P 0 movd mm1, [r0] movd mm2, [r2] punpckldq mm1, [r0+r1] punpckldq mm2, [r2+r3] psadbw mm1, mm2 paddw mm0, mm1 lea r0, [r0+2*r1] lea r2, [r2+2*r3] %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD 2 cglobal pixel_sad_%1x%2_mmx2, 4,4 pxor mm0, mm0 %rep %2/2 SAD_INC_2x%1P %endrep movd eax, mm0 RET %endmacro SAD 16, 16 SAD 16, 8 SAD 8, 16 SAD 8, 8 SAD 8, 4 SAD 4, 16 SAD 4, 8 SAD 4, 4 ;============================================================================= ; SAD XMM ;============================================================================= %macro SAD_END_SSE2 0 MOVHL m1, m0 paddw m0, m1 movd eax, m0 RET %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD_W16 1 ; h cglobal pixel_sad_16x%1, 4,4 %ifidn cpuname, sse2 .skip_prologue: %endif %assign %%i 0 %if ARCH_X86_64 lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile lea r5, [3*r3] %rep %1/4 movu m1, [r2] psadbw m1, [r0] movu m3, [r2+r3] psadbw m3, [r0+r1] movu m2, [r2+2*r3] psadbw m2, [r0+2*r1] movu m4, [r2+r5] psadbw m4, [r0+r6] %if %%i != %1/4-1 lea r2, [r2+4*r3] lea r0, [r0+4*r1] %endif paddw m1, m3 paddw m2, m4 ACCUM paddw, 0, 1, %%i paddw m0, m2 
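; ACCUM paddw, 0, 1, %%i takes m1 as the accumulator on the first iteration
; (%%i==0) and expands to a plain "paddw m0, m1" on later ones, so m0 never
; has to be zeroed up front (see the ACCUM macro in x86util.asm).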
%assign %%i %%i+1 %endrep %else ; The cost of having to save and restore registers on x86-32 %rep %1/2 ; nullifies the benefit of having 3*stride in registers. movu m1, [r2] psadbw m1, [r0] movu m2, [r2+r3] psadbw m2, [r0+r1] %if %%i != %1/2-1 lea r2, [r2+2*r3] lea r0, [r0+2*r1] %endif ACCUM paddw, 0, 1, %%i paddw m0, m2 %assign %%i %%i+1 %endrep %endif SAD_END_SSE2 %endmacro INIT_XMM sse2 SAD_W16 16 SAD_W16 8 INIT_XMM sse3 SAD_W16 16 SAD_W16 8 INIT_XMM sse2, aligned SAD_W16 16 SAD_W16 8 %macro SAD_INC_4x8P_SSE 1 movq m1, [r0] movq m2, [r0+r1] lea r0, [r0+2*r1] movq m3, [r2] movq m4, [r2+r3] lea r2, [r2+2*r3] movhps m1, [r0] movhps m2, [r0+r1] movhps m3, [r2] movhps m4, [r2+r3] lea r0, [r0+2*r1] psadbw m1, m3 psadbw m2, m4 lea r2, [r2+2*r3] ACCUM paddw, 0, 1, %1 paddw m0, m2 %endmacro INIT_XMM ;Even on Nehalem, no sizes other than 8x16 benefit from this method. cglobal pixel_sad_8x16_sse2, 4,4 SAD_INC_4x8P_SSE 0 SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_END_SSE2 %macro SAD_W48_AVX512 3 ; w, h, d/q cglobal pixel_sad_%1x%2, 4,4 kxnorb k1, k1, k1 kaddb k1, k1, k1 %assign %%i 0 %if ARCH_X86_64 && %2 != 4 lea r6, [3*r1] lea r5, [3*r3] %rep %2/4 mov%3 m1, [r0] vpbroadcast%3 m1 {k1}, [r0+r1] mov%3 m3, [r2] vpbroadcast%3 m3 {k1}, [r2+r3] mov%3 m2, [r0+2*r1] vpbroadcast%3 m2 {k1}, [r0+r6] mov%3 m4, [r2+2*r3] vpbroadcast%3 m4 {k1}, [r2+r5] %if %%i != %2/4-1 lea r0, [r0+4*r1] lea r2, [r2+4*r3] %endif psadbw m1, m3 psadbw m2, m4 ACCUM paddd, 0, 1, %%i paddd m0, m2 %assign %%i %%i+1 %endrep %else %rep %2/2 mov%3 m1, [r0] vpbroadcast%3 m1 {k1}, [r0+r1] mov%3 m2, [r2] vpbroadcast%3 m2 {k1}, [r2+r3] %if %%i != %2/2-1 lea r0, [r0+2*r1] lea r2, [r2+2*r3] %endif psadbw m1, m2 ACCUM paddd, 0, 1, %%i %assign %%i %%i+1 %endrep %endif %if %1 == 8 punpckhqdq m1, m0, m0 paddd m0, m1 %endif movd eax, m0 RET %endmacro INIT_XMM avx512 SAD_W48_AVX512 4, 4, d SAD_W48_AVX512 4, 8, d SAD_W48_AVX512 4, 16, d SAD_W48_AVX512 8, 4, q SAD_W48_AVX512 8, 8, q SAD_W48_AVX512 8, 16, q %macro SAD_W16_AVX512_START 1 ; h cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory lea r1, [3*r3] %endmacro %macro SAD_W16_AVX512_END 0 paddd m0, m1 paddd m0, m2 paddd m0, m3 %if mmsize == 64 vextracti32x8 ym1, m0, 1 paddd ym0, ym1 %endif vextracti128 xm1, ym0, 1 paddd xmm0, xm0, xm1 punpckhqdq xmm1, xmm0, xmm0 paddd xmm0, xmm1 movd eax, xmm0 RET %endmacro INIT_YMM avx512 cglobal pixel_sad_16x8, 4,4 SAD_W16_AVX512_START 8 movu xm0, [r2] vinserti128 m0, [r2+r3], 1 psadbw m0, [r0+0*32] movu xm1, [r2+2*r3] vinserti128 m1, [r2+r1], 1 lea r2, [r2+4*r3] psadbw m1, [r0+1*32] movu xm2, [r2] vinserti128 m2, [r2+r3], 1 psadbw m2, [r0+2*32] movu xm3, [r2+2*r3] vinserti128 m3, [r2+r1], 1 psadbw m3, [r0+3*32] SAD_W16_AVX512_END INIT_ZMM avx512 cglobal pixel_sad_16x16, 4,4 SAD_W16_AVX512_START 16 movu xm0, [r2] vinserti128 ym0, [r2+r3], 1 movu xm1, [r2+4*r3] vinserti32x4 m0, [r2+2*r3], 2 vinserti32x4 m1, [r2+2*r1], 2 vinserti32x4 m0, [r2+r1], 3 lea r2, [r2+4*r3] vinserti32x4 m1, [r2+r3], 1 psadbw m0, [r0+0*64] vinserti32x4 m1, [r2+r1], 3 lea r2, [r2+4*r3] psadbw m1, [r0+1*64] movu xm2, [r2] vinserti128 ym2, [r2+r3], 1 movu xm3, [r2+4*r3] vinserti32x4 m2, [r2+2*r3], 2 vinserti32x4 m3, [r2+2*r1], 2 vinserti32x4 m2, [r2+r1], 3 lea r2, [r2+4*r3] vinserti32x4 m3, [r2+r3], 1 psadbw m2, [r0+2*64] vinserti32x4 m3, [r2+r1], 3 psadbw m3, [r0+3*64] SAD_W16_AVX512_END ;----------------------------------------------------------------------------- ; 
void pixel_vsad( pixel *src, intptr_t stride ); ;----------------------------------------------------------------------------- %if ARCH_X86_64 == 0 INIT_MMX cglobal pixel_vsad_mmx2, 3,3 mova m0, [r0] mova m1, [r0+8] mova m2, [r0+r1] mova m3, [r0+r1+8] lea r0, [r0+r1*2] psadbw m0, m2 psadbw m1, m3 paddw m0, m1 sub r2d, 2 je .end .loop: mova m4, [r0] mova m5, [r0+8] mova m6, [r0+r1] mova m7, [r0+r1+8] lea r0, [r0+r1*2] psadbw m2, m4 psadbw m3, m5 psadbw m4, m6 psadbw m5, m7 ;max sum: 31*16*255(pixel_max)=126480 paddd m0, m2 paddd m0, m3 paddd m0, m4 paddd m0, m5 mova m2, m6 mova m3, m7 sub r2d, 2 jg .loop .end: movd eax, m0 RET %endif INIT_XMM cglobal pixel_vsad_sse2, 3,3 mova m0, [r0] mova m1, [r0+r1] lea r0, [r0+r1*2] psadbw m0, m1 sub r2d, 2 je .end .loop: mova m2, [r0] mova m3, [r0+r1] lea r0, [r0+r1*2] psadbw m1, m2 psadbw m2, m3 paddw m0, m1 paddw m0, m2 mova m1, m3 sub r2d, 2 jg .loop .end: MOVHL m1, m0 ;max sum: 31*16*255(pixel_max)=126480 paddd m0, m1 movd eax, m0 RET ;----------------------------------------------------------------------------- ; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- cglobal intra_sad_x3_4x4_mmx2, 3,3 pxor mm7, mm7 movd mm0, [r1-FDEC_STRIDE] movd mm1, [r0+FENC_STRIDE*0] movd mm2, [r0+FENC_STRIDE*2] punpckldq mm0, mm0 punpckldq mm1, [r0+FENC_STRIDE*1] punpckldq mm2, [r0+FENC_STRIDE*3] movq mm6, mm0 movq mm3, mm1 psadbw mm3, mm0 psadbw mm0, mm2 paddw mm0, mm3 movd [r2], mm0 ;V prediction cost movd mm3, [r1+FDEC_STRIDE*0-4] movd mm0, [r1+FDEC_STRIDE*1-4] movd mm4, [r1+FDEC_STRIDE*2-4] movd mm5, [r1+FDEC_STRIDE*3-4] punpcklbw mm3, mm0 punpcklbw mm4, mm5 movq mm5, mm3 punpckhwd mm5, mm4 punpckhdq mm5, mm6 psadbw mm5, mm7 punpckhbw mm3, mm3 punpckhbw mm4, mm4 punpckhwd mm3, mm3 punpckhwd mm4, mm4 psraw mm5, 2 pavgw mm5, mm7 punpcklbw mm5, mm5 pshufw mm5, mm5, 0 ;DC prediction movq mm6, mm5 psadbw mm5, mm1 psadbw mm6, mm2 psadbw mm1, mm3 psadbw mm2, mm4 paddw mm5, mm6 paddw mm1, mm2 movd [r2+8], mm5 ;DC prediction cost movd [r2+4], mm1 ;H prediction cost RET ;----------------------------------------------------------------------------- ; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]); ;----------------------------------------------------------------------------- ;m0 = DC ;m6 = V ;m7 = H ;m1 = DC score ;m2 = V score ;m3 = H score ;m5 = pixel row ;m4 = temp %macro INTRA_SAD_HVDC_ITER 2 movq m5, [r0+FENC_STRIDE*%1] movq m4, m5 psadbw m4, m0 ACCUM paddw, 1, 4, %1 movq m4, m5 psadbw m4, m6 ACCUM paddw, 2, 4, %1 pshufw m4, m7, %2 psadbw m5, m4 ACCUM paddw, 3, 5, %1 %endmacro INIT_MMX cglobal intra_sad_x3_8x8_mmx2, 3,3 movq m7, [r1+7] pxor m0, m0 movq m6, [r1+16] ;V prediction pxor m1, m1 psadbw m0, m7 psadbw m1, m6 paddw m0, m1 paddw m0, [pw_8] psrlw m0, 4 punpcklbw m0, m0 pshufw m0, m0, q0000 ;DC prediction punpckhbw m7, m7 INTRA_SAD_HVDC_ITER 0, q3333 INTRA_SAD_HVDC_ITER 1, q2222 INTRA_SAD_HVDC_ITER 2, q1111 INTRA_SAD_HVDC_ITER 3, q0000 movq m7, [r1+7] punpcklbw m7, m7 INTRA_SAD_HVDC_ITER 4, q3333 INTRA_SAD_HVDC_ITER 5, q2222 INTRA_SAD_HVDC_ITER 6, q1111 INTRA_SAD_HVDC_ITER 7, q0000 movd [r2+0], m2 movd [r2+4], m3 movd [r2+8], m1 RET ;----------------------------------------------------------------------------- ; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- %macro INTRA_SAD_HV_ITER 1 %if cpuflag(ssse3) movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4] movd m3, 
[r1 + FDEC_STRIDE*(%1-3) - 4] pshufb m1, m7 pshufb m3, m7 %else movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8] movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8] punpckhbw m1, m1 punpckhbw m3, m3 pshufw m1, m1, q3333 pshufw m3, m3, q3333 %endif movq m4, [r0 + FENC_STRIDE*(%1+0)] movq m5, [r0 + FENC_STRIDE*(%1+1)] psadbw m1, m4 psadbw m3, m5 psadbw m4, m6 psadbw m5, m6 paddw m1, m3 paddw m4, m5 ACCUM paddw, 0, 1, %1 ACCUM paddw, 2, 4, %1 %endmacro %macro INTRA_SAD_8x8C 0 cglobal intra_sad_x3_8x8c, 3,3 movq m6, [r1 - FDEC_STRIDE] add r1, FDEC_STRIDE*4 %if cpuflag(ssse3) movq m7, [pb_3] %endif INTRA_SAD_HV_ITER 0 INTRA_SAD_HV_ITER 2 INTRA_SAD_HV_ITER 4 INTRA_SAD_HV_ITER 6 movd [r2+4], m0 movd [r2+8], m2 pxor m7, m7 movq m2, [r1 + FDEC_STRIDE*-4 - 8] movq m4, [r1 + FDEC_STRIDE*-2 - 8] movq m3, [r1 + FDEC_STRIDE* 0 - 8] movq m5, [r1 + FDEC_STRIDE* 2 - 8] punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8] punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8] punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8] punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8] punpckhbw m2, m4 punpckhbw m3, m5 psrlq m2, 32 psrlq m3, 32 psadbw m2, m7 ; s2 psadbw m3, m7 ; s3 movq m1, m6 SWAP 0, 6 punpckldq m0, m7 punpckhdq m1, m7 psadbw m0, m7 ; s0 psadbw m1, m7 ; s1 punpcklwd m0, m1 punpcklwd m2, m3 punpckldq m0, m2 ;s0 s1 s2 s3 pshufw m3, m0, q3312 ;s2,s1,s3,s3 pshufw m0, m0, q1310 ;s0,s1,s3,s1 paddw m0, m3 psrlw m0, 2 pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 %if cpuflag(ssse3) movq2dq xmm0, m0 pshufb xmm0, [pb_shuf8x8c] movq xmm1, [r0+FENC_STRIDE*0] movq xmm2, [r0+FENC_STRIDE*1] movq xmm3, [r0+FENC_STRIDE*2] movq xmm4, [r0+FENC_STRIDE*3] movhps xmm1, [r0+FENC_STRIDE*4] movhps xmm2, [r0+FENC_STRIDE*5] movhps xmm3, [r0+FENC_STRIDE*6] movhps xmm4, [r0+FENC_STRIDE*7] psadbw xmm1, xmm0 psadbw xmm2, xmm0 psadbw xmm3, xmm0 psadbw xmm4, xmm0 paddw xmm1, xmm2 paddw xmm1, xmm3 paddw xmm1, xmm4 MOVHL xmm0, xmm1 paddw xmm1, xmm0 movd [r2], xmm1 %else packuswb m0, m0 punpcklbw m0, m0 movq m1, m0 punpcklbw m0, m0 ; 4x dc0 4x dc1 punpckhbw m1, m1 ; 4x dc2 4x dc3 movq m2, [r0+FENC_STRIDE*0] movq m3, [r0+FENC_STRIDE*1] movq m4, [r0+FENC_STRIDE*2] movq m5, [r0+FENC_STRIDE*3] movq m6, [r0+FENC_STRIDE*4] movq m7, [r0+FENC_STRIDE*5] psadbw m2, m0 psadbw m3, m0 psadbw m4, m0 psadbw m5, m0 movq m0, [r0+FENC_STRIDE*6] psadbw m6, m1 psadbw m7, m1 psadbw m0, m1 psadbw m1, [r0+FENC_STRIDE*7] paddw m2, m3 paddw m4, m5 paddw m6, m7 paddw m0, m1 paddw m2, m4 paddw m6, m0 paddw m2, m6 movd [r2], m2 %endif RET %endmacro INIT_MMX mmx2 INTRA_SAD_8x8C INIT_MMX ssse3 INTRA_SAD_8x8C INIT_YMM avx2 cglobal intra_sad_x3_8x8c, 3,3,7 vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred add r1, FDEC_STRIDE*4-1 pxor xm5, xm5 punpckldq xm3, xm2, xm5 ; V0 _ V1 _ movd xm0, [r1 + FDEC_STRIDE*-1 - 3] movd xm1, [r1 + FDEC_STRIDE* 3 - 3] pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0 pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0 pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1 pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1 pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2 pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2 punpcklqdq xm0, xm1 ; H0 _ H1 _ vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1 pshufb xm0, [hpred_shuf] ; H00224466 H11335577 psadbw m3, m5 ; s0 s1 s2 s3 vpermq m4, m3, q3312 ; s2 s1 s3 s3 vpermq m3, m3, q1310 ; s0 s1 s3 s1 paddw m3, m4 psrlw m3, 2 pavgw m3, m5 ; s0+s2 s1 s3 s1+s3 pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _ vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V vpermq m0, m0, q3120 ; H00224466 _ H11335577 _ movddup m2, [r0+FENC_STRIDE*0] movddup m4, [r0+FENC_STRIDE*2] pshuflw m3, m0, q0000 psadbw m3, m2 psadbw 
m2, m1 pshuflw m5, m0, q1111 psadbw m5, m4 psadbw m4, m1 paddw m2, m4 paddw m3, m5 movddup m4, [r0+FENC_STRIDE*4] pshuflw m5, m0, q2222 psadbw m5, m4 psadbw m4, m6 paddw m2, m4 paddw m3, m5 movddup m4, [r0+FENC_STRIDE*6] pshuflw m5, m0, q3333 psadbw m5, m4 psadbw m4, m6 paddw m2, m4 paddw m3, m5 vextracti128 xm0, m2, 1 vextracti128 xm1, m3, 1 paddw xm2, xm0 ; DC V paddw xm3, xm1 ; H pextrd [r2+8], xm2, 2 ; V movd [r2+4], xm3 ; H movd [r2+0], xm2 ; DC RET ;----------------------------------------------------------------------------- ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score %macro INTRA_SAD16 0 cglobal intra_sad_x3_16x16, 3,5,8 pxor mm0, mm0 pxor mm1, mm1 psadbw mm0, [r1-FDEC_STRIDE+0] psadbw mm1, [r1-FDEC_STRIDE+8] paddw mm0, mm1 movd r3d, mm0 %if cpuflag(ssse3) mova m1, [pb_3] %endif %assign x 0 %rep 16 movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] %if (x&3)==3 && x!=15 add r1, FDEC_STRIDE*4 %endif add r3d, r4d %assign x x+1 %endrep sub r1, FDEC_STRIDE*12 add r3d, 16 shr r3d, 5 imul r3d, 0x01010101 movd m7, r3d mova m5, [r1-FDEC_STRIDE] %if mmsize==16 pshufd m7, m7, 0 %else mova m1, [r1-FDEC_STRIDE+8] punpckldq m7, m7 %endif pxor m4, m4 pxor m3, m3 pxor m2, m2 mov r3d, 15*FENC_STRIDE .vloop: SPLATB_LOAD m6, r1+r3*2-1, m1 mova m0, [r0+r3] psadbw m0, m7 paddw m4, m0 mova m0, [r0+r3] psadbw m0, m5 paddw m2, m0 %if mmsize==8 mova m0, [r0+r3] psadbw m0, m6 paddw m3, m0 mova m0, [r0+r3+8] psadbw m0, m7 paddw m4, m0 mova m0, [r0+r3+8] psadbw m0, m1 paddw m2, m0 psadbw m6, [r0+r3+8] paddw m3, m6 %else psadbw m6, [r0+r3] paddw m3, m6 %endif add r3d, -FENC_STRIDE jge .vloop %if mmsize==16 pslldq m3, 4 por m3, m2 MOVHL m1, m3 paddw m3, m1 movq [r2+0], m3 MOVHL m1, m4 paddw m4, m1 %else movd [r2+0], m2 movd [r2+4], m3 %endif movd [r2+8], m4 RET %endmacro INIT_MMX mmx2 INTRA_SAD16 INIT_XMM sse2 INTRA_SAD16 INIT_XMM ssse3 INTRA_SAD16 INIT_YMM avx2 cglobal intra_sad_x3_16x16, 3,5,6 pxor xm0, xm0 psadbw xm0, [r1-FDEC_STRIDE] MOVHL xm1, xm0 paddw xm0, xm1 movd r3d, xm0 %assign x 0 %rep 16 movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)] %if (x&3)==3 && x!=15 add r1, FDEC_STRIDE*4 %endif add r3d, r4d %assign x x+1 %endrep sub r1, FDEC_STRIDE*12 add r3d, 16 shr r3d, 5 movd xm5, r3d vpbroadcastb xm5, xm5 vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction pxor m4, m4 ; DC / V accumulator pxor xm3, xm3 ; H accumulator mov r3d, 15*FENC_STRIDE .vloop: vpbroadcastb xm2, [r1+r3*2-1] vbroadcasti128 m0, [r0+r3] psadbw m1, m0, m5 psadbw xm0, xm2 paddw m4, m1 paddw xm3, xm0 add r3d, -FENC_STRIDE jge .vloop punpckhqdq m5, m4, m4 MOVHL xm2, xm3 paddw m4, m5 ; DC / V paddw xm3, xm2 ; H vextracti128 xm2, m4, 1 movd [r2+0], xm2 movd [r2+4], xm3 movd [r2+8], xm4 RET ;============================================================================= ; SAD x3/x4 MMX ;============================================================================= %macro SAD_X3_START_1x8P 0 movq mm3, [r0] movq mm0, [r1] movq mm1, [r2] movq mm2, [r3] psadbw mm0, mm3 psadbw mm1, mm3 psadbw mm2, mm3 %endmacro %macro SAD_X3_1x8P 2 movq mm3, [r0+%1] movq mm4, [r1+%2] movq mm5, [r2+%2] movq mm6, [r3+%2] psadbw mm4, mm3 psadbw mm5, mm3 psadbw mm6, mm3 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 %endmacro %macro SAD_X3_START_2x4P 3 movd mm3, [r0] movd %1, [r1] movd %2, [r2] movd %3, [r3] punpckldq mm3, [r0+FENC_STRIDE] 
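; The movd/punpckldq pairs pack two 4-byte rows (fenc plus each candidate ref)
; into single 8-byte MMX registers, so one psadbw per candidate covers a 4x2 block.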
punpckldq %1, [r1+r4] punpckldq %2, [r2+r4] punpckldq %3, [r3+r4] psadbw %1, mm3 psadbw %2, mm3 psadbw %3, mm3 %endmacro %macro SAD_X3_2x16P 1 %if %1 SAD_X3_START_1x8P %else SAD_X3_1x8P 0, 0 %endif SAD_X3_1x8P 8, 8 SAD_X3_1x8P FENC_STRIDE, r4 SAD_X3_1x8P FENC_STRIDE+8, r4+8 add r0, 2*FENC_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X3_2x8P 1 %if %1 SAD_X3_START_1x8P %else SAD_X3_1x8P 0, 0 %endif SAD_X3_1x8P FENC_STRIDE, r4 add r0, 2*FENC_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X3_2x4P 1 %if %1 SAD_X3_START_2x4P mm0, mm1, mm2 %else SAD_X3_START_2x4P mm4, mm5, mm6 paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 %endif add r0, 2*FENC_STRIDE lea r1, [r1+2*r4] lea r2, [r2+2*r4] lea r3, [r3+2*r4] %endmacro %macro SAD_X4_START_1x8P 0 movq mm7, [r0] movq mm0, [r1] movq mm1, [r2] movq mm2, [r3] movq mm3, [r4] psadbw mm0, mm7 psadbw mm1, mm7 psadbw mm2, mm7 psadbw mm3, mm7 %endmacro %macro SAD_X4_1x8P 2 movq mm7, [r0+%1] movq mm4, [r1+%2] movq mm5, [r2+%2] movq mm6, [r3+%2] psadbw mm4, mm7 psadbw mm5, mm7 psadbw mm6, mm7 psadbw mm7, [r4+%2] paddw mm0, mm4 paddw mm1, mm5 paddw mm2, mm6 paddw mm3, mm7 %endmacro %macro SAD_X4_START_2x4P 0 movd mm7, [r0] movd mm0, [r1] movd mm1, [r2] movd mm2, [r3] movd mm3, [r4] punpckldq mm7, [r0+FENC_STRIDE] punpckldq mm0, [r1+r5] punpckldq mm1, [r2+r5] punpckldq mm2, [r3+r5] punpckldq mm3, [r4+r5] psadbw mm0, mm7 psadbw mm1, mm7 psadbw mm2, mm7 psadbw mm3, mm7 %endmacro %macro SAD_X4_INC_2x4P 0 movd mm7, [r0] movd mm4, [r1] movd mm5, [r2] punpckldq mm7, [r0+FENC_STRIDE] punpckldq mm4, [r1+r5] punpckldq mm5, [r2+r5] psadbw mm4, mm7 psadbw mm5, mm7 paddw mm0, mm4 paddw mm1, mm5 movd mm4, [r3] movd mm5, [r4] punpckldq mm4, [r3+r5] punpckldq mm5, [r4+r5] psadbw mm4, mm7 psadbw mm5, mm7 paddw mm2, mm4 paddw mm3, mm5 %endmacro %macro SAD_X4_2x16P 1 %if %1 SAD_X4_START_1x8P %else SAD_X4_1x8P 0, 0 %endif SAD_X4_1x8P 8, 8 SAD_X4_1x8P FENC_STRIDE, r5 SAD_X4_1x8P FENC_STRIDE+8, r5+8 add r0, 2*FENC_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X4_2x8P 1 %if %1 SAD_X4_START_1x8P %else SAD_X4_1x8P 0, 0 %endif SAD_X4_1x8P FENC_STRIDE, r5 add r0, 2*FENC_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X4_2x4P 1 %if %1 SAD_X4_START_2x4P %else SAD_X4_INC_2x4P %endif add r0, 2*FENC_STRIDE lea r1, [r1+2*r5] lea r2, [r2+2*r5] lea r3, [r3+2*r5] lea r4, [r4+2*r5] %endmacro %macro SAD_X3_END 0 %if UNIX64 movd [r5+0], mm0 movd [r5+4], mm1 movd [r5+8], mm2 %else mov r0, r5mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 %endif RET %endmacro %macro SAD_X4_END 0 mov r0, r6mp movd [r0+0], mm0 movd [r0+4], mm1 movd [r0+8], mm2 movd [r0+12], mm3 RET %endmacro ;----------------------------------------------------------------------------- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X 3 cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2 SAD_X%1_2x%2P 1 %rep %3/2-1 SAD_X%1_2x%2P 0 %endrep SAD_X%1_END %endmacro INIT_MMX SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 SAD_X 3, 4, 8 SAD_X 3, 4, 4 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 SAD_X 4, 4, 8 SAD_X 4, 4, 4 ;============================================================================= ; SAD x3/x4 XMM 
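; (each call SADs one fenc block against 3 or 4 candidate references and
;  writes the results to scores[])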
;============================================================================= %macro SAD_X3_START_1x16P_SSE2 0 mova m2, [r0] %if cpuflag(avx) psadbw m0, m2, [r1] psadbw m1, m2, [r2] psadbw m2, [r3] %else movu m0, [r1] movu m1, [r2] movu m3, [r3] psadbw m0, m2 psadbw m1, m2 psadbw m2, m3 %endif %endmacro %macro SAD_X3_1x16P_SSE2 2 mova m3, [r0+%1] %if cpuflag(avx) psadbw m4, m3, [r1+%2] psadbw m5, m3, [r2+%2] psadbw m3, [r3+%2] %else movu m4, [r1+%2] movu m5, [r2+%2] movu m6, [r3+%2] psadbw m4, m3 psadbw m5, m3 psadbw m3, m6 %endif paddw m0, m4 paddw m1, m5 paddw m2, m3 %endmacro %if ARCH_X86_64 DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 5 %endif %macro SAD_X3_4x16P_SSE2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_1x16P_SSE2 %else SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0 %endif SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1 SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2 SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X3_START_2x8P_SSE2 0 movq m3, [r0] movq m0, [r1] movq m1, [r2] movq m2, [r3] movhps m3, [r0+FENC_STRIDE] movhps m0, [r1+r4] movhps m1, [r2+r4] movhps m2, [r3+r4] psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 %endmacro %macro SAD_X3_2x8P_SSE2 4 movq m6, [r0+%1] movq m3, [r1+%2] movq m4, [r2+%2] movq m5, [r3+%2] movhps m6, [r0+%3] movhps m3, [r1+%4] movhps m4, [r2+%4] movhps m5, [r3+%4] psadbw m3, m6 psadbw m4, m6 psadbw m5, m6 paddw m0, m3 paddw m1, m4 paddw m2, m5 %endmacro %macro SAD_X4_START_2x8P_SSE2 0 movq m4, [r0] movq m0, [r1] movq m1, [r2] movq m2, [r3] movq m3, [r4] movhps m4, [r0+FENC_STRIDE] movhps m0, [r1+r5] movhps m1, [r2+r5] movhps m2, [r3+r5] movhps m3, [r4+r5] psadbw m0, m4 psadbw m1, m4 psadbw m2, m4 psadbw m3, m4 %endmacro %macro SAD_X4_2x8P_SSE2 4 movq m6, [r0+%1] movq m4, [r1+%2] movq m5, [r2+%2] movhps m6, [r0+%3] movhps m4, [r1+%4] movhps m5, [r2+%4] psadbw m4, m6 psadbw m5, m6 paddw m0, m4 paddw m1, m5 movq m4, [r3+%2] movq m5, [r4+%2] movhps m4, [r3+%4] movhps m5, [r4+%4] psadbw m4, m6 psadbw m5, m6 paddw m2, m4 paddw m3, m5 %endmacro %macro SAD_X4_START_1x16P_SSE2 0 mova m3, [r0] %if cpuflag(avx) psadbw m0, m3, [r1] psadbw m1, m3, [r2] psadbw m2, m3, [r3] psadbw m3, [r4] %else movu m0, [r1] movu m1, [r2] movu m2, [r3] movu m4, [r4] psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 psadbw m3, m4 %endif %endmacro %macro SAD_X4_1x16P_SSE2 2 mova m6, [r0+%1] %if cpuflag(avx) psadbw m4, m6, [r1+%2] psadbw m5, m6, [r2+%2] %else movu m4, [r1+%2] movu m5, [r2+%2] psadbw m4, m6 psadbw m5, m6 %endif paddw m0, m4 paddw m1, m5 %if cpuflag(avx) psadbw m4, m6, [r3+%2] psadbw m5, m6, [r4+%2] %else movu m4, [r3+%2] movu m5, [r4+%2] psadbw m4, m6 psadbw m5, m6 %endif paddw m2, m4 paddw m3, m5 %endmacro %macro SAD_X4_4x16P_SSE2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_1x16P_SSE2 %else SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0 %endif SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1 SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2 SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_4x8P_SSE2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_2x8P_SSE2 %else SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1 %endif SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, 
[r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X4_4x8P_SSE2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x8P_SSE2 %else SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_END_SSE2 0 movifnidn r5, r5mp %if cpuflag(ssse3) packssdw m0, m1 packssdw m2, m2 phaddd m0, m2 mova [r5], m0 %else movhlps m3, m0 movhlps m4, m1 movhlps m5, m2 paddw m0, m3 paddw m1, m4 paddw m2, m5 movd [r5+0], m0 movd [r5+4], m1 movd [r5+8], m2 %endif RET %endmacro %macro SAD_X4_END_SSE2 0 mov r0, r6mp %if cpuflag(ssse3) packssdw m0, m1 packssdw m2, m3 phaddd m0, m2 mova [r0], m0 %else psllq m1, 32 psllq m3, 32 paddw m0, m1 paddw m2, m3 movhlps m1, m0 movhlps m3, m2 paddw m0, m1 paddw m2, m3 movq [r0+0], m0 movq [r0+8], m2 %endif RET %endmacro %macro SAD_X4_START_2x8P_SSSE3 0 movddup m4, [r0] movq m0, [r1] movq m1, [r3] movhps m0, [r2] movhps m1, [r4] movddup m5, [r0+FENC_STRIDE] movq m2, [r1+r5] movq m3, [r3+r5] movhps m2, [r2+r5] movhps m3, [r4+r5] psadbw m0, m4 psadbw m1, m4 psadbw m2, m5 psadbw m3, m5 paddw m0, m2 paddw m1, m3 %endmacro %macro SAD_X4_2x8P_SSSE3 4 movddup m6, [r0+%1] movq m2, [r1+%2] movq m3, [r3+%2] movhps m2, [r2+%2] movhps m3, [r4+%2] movddup m7, [r0+%3] movq m4, [r1+%4] movq m5, [r3+%4] movhps m4, [r2+%4] movhps m5, [r4+%4] psadbw m2, m6 psadbw m3, m6 psadbw m4, m7 psadbw m5, m7 paddw m0, m2 paddw m1, m3 paddw m0, m4 paddw m1, m5 %endmacro %macro SAD_X4_4x8P_SSSE3 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x8P_SSSE3 %else SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X4_END_SSSE3 0 mov r0, r6mp packssdw m0, m1 mova [r0], m0 RET %endmacro %macro SAD_X3_START_2x16P_AVX2 0 movu m3, [r0] ; assumes FENC_STRIDE == 16 movu xm0, [r1] movu xm1, [r2] movu xm2, [r3] vinserti128 m0, m0, [r1+r4], 1 vinserti128 m1, m1, [r2+r4], 1 vinserti128 m2, m2, [r3+r4], 1 psadbw m0, m3 psadbw m1, m3 psadbw m2, m3 %endmacro %macro SAD_X3_2x16P_AVX2 3 movu m3, [r0+%1] ; assumes FENC_STRIDE == 16 movu xm4, [r1+%2] movu xm5, [r2+%2] movu xm6, [r3+%2] vinserti128 m4, m4, [r1+%3], 1 vinserti128 m5, m5, [r2+%3], 1 vinserti128 m6, m6, [r3+%3], 1 psadbw m4, m3 psadbw m5, m3 psadbw m6, m3 paddw m0, m4 paddw m1, m5 paddw m2, m6 %endmacro %macro SAD_X3_4x16P_AVX2 2 %if %1==0 lea t0, [r4*3] SAD_X3_START_2x16P_AVX2 %else SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1 %endif SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r4] lea r2, [r2+4*r4] lea r3, [r3+4*r4] %endif %endmacro %macro SAD_X4_START_2x16P_AVX2 0 vbroadcasti128 m4, [r0] vbroadcasti128 m5, [r0+FENC_STRIDE] movu xm0, [r1] movu xm1, [r2] movu xm2, [r1+r5] movu xm3, [r2+r5] vinserti128 m0, m0, [r3], 1 vinserti128 m1, m1, [r4], 1 vinserti128 m2, m2, [r3+r5], 1 vinserti128 m3, m3, [r4+r5], 1 psadbw m0, m4 psadbw m1, m4 psadbw m2, m5 psadbw m3, m5 paddw m0, m2 paddw m1, m3 %endmacro %macro SAD_X4_2x16P_AVX2 4 vbroadcasti128 m6, [r0+%1] vbroadcasti128 m7, [r0+%3] movu xm2, [r1+%2] movu xm3, [r2+%2] movu xm4, [r1+%4] movu xm5, 
[r2+%4] vinserti128 m2, m2, [r3+%2], 1 vinserti128 m3, m3, [r4+%2], 1 vinserti128 m4, m4, [r3+%4], 1 vinserti128 m5, m5, [r4+%4], 1 psadbw m2, m6 psadbw m3, m6 psadbw m4, m7 psadbw m5, m7 paddw m0, m2 paddw m1, m3 paddw m0, m4 paddw m1, m5 %endmacro %macro SAD_X4_4x16P_AVX2 2 %if %1==0 lea r6, [r5*3] SAD_X4_START_2x16P_AVX2 %else SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1 %endif SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6 %if %1 != %2-1 %if (%1&1) != 0 add r0, 8*FENC_STRIDE %endif lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endif %endmacro %macro SAD_X3_END_AVX2 0 movifnidn r5, r5mp packssdw m0, m1 ; 0 0 1 1 0 0 1 1 packssdw m2, m2 ; 2 2 _ _ 2 2 _ _ phaddd m0, m2 ; 0 1 2 _ 0 1 2 _ vextracti128 xm1, m0, 1 paddd xm0, xm1 ; 0 1 2 _ mova [r5], xm0 RET %endmacro %macro SAD_X4_END_AVX2 0 mov r0, r6mp packssdw m0, m1 ; 0 0 1 1 2 2 3 3 vextracti128 xm1, m0, 1 phaddd xm0, xm1 ; 0 1 2 3 mova [r0], xm0 RET %endmacro ;----------------------------------------------------------------------------- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X_SSE2 4 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 %assign x 0 %rep %3/4 SAD_X%1_4x%2P_SSE2 x, %3/4 %assign x x+1 %endrep SAD_X%1_END_SSE2 %endmacro INIT_XMM sse2 SAD_X_SSE2 3, 16, 16, 7 SAD_X_SSE2 3, 16, 8, 7 SAD_X_SSE2 3, 8, 16, 7 SAD_X_SSE2 3, 8, 8, 7 SAD_X_SSE2 3, 8, 4, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 8, 7 SAD_X_SSE2 4, 8, 16, 7 SAD_X_SSE2 4, 8, 8, 7 SAD_X_SSE2 4, 8, 4, 7 INIT_XMM sse3 SAD_X_SSE2 3, 16, 16, 7 SAD_X_SSE2 3, 16, 8, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 8, 7 %macro SAD_X_SSSE3 3 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8 %assign x 0 %rep %3/4 SAD_X%1_4x%2P_SSSE3 x, %3/4 %assign x x+1 %endrep SAD_X%1_END_SSSE3 %endmacro INIT_XMM ssse3 SAD_X_SSE2 3, 16, 16, 7 SAD_X_SSE2 3, 16, 8, 7 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 8, 7 SAD_X_SSSE3 4, 8, 16 SAD_X_SSSE3 4, 8, 8 SAD_X_SSSE3 4, 8, 4 INIT_XMM avx SAD_X_SSE2 3, 16, 16, 6 SAD_X_SSE2 3, 16, 8, 6 SAD_X_SSE2 4, 16, 16, 7 SAD_X_SSE2 4, 16, 8, 7 %macro SAD_X_AVX2 4 cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4 %assign x 0 %rep %3/4 SAD_X%1_4x%2P_AVX2 x, %3/4 %assign x x+1 %endrep SAD_X%1_END_AVX2 %endmacro INIT_YMM avx2 SAD_X_AVX2 3, 16, 16, 7 SAD_X_AVX2 3, 16, 8, 7 SAD_X_AVX2 4, 16, 16, 8 SAD_X_AVX2 4, 16, 8, 8 %macro SAD_X_W4_AVX512 2 ; x, h cglobal pixel_sad_x%1_4x%2, %1+2,%1+3 mov t1d, 0xa kmovb k1, t1d lea t1, [3*t0] kaddb k2, k1, k1 kshiftlb k3, k1, 2 %assign %%i 0 %rep %2/4 movu m6, [r0+%%i*64] vmovddup m6 {k1}, [r0+%%i*64+32] movd xmm2, [r1] movd xmm4, [r1+t0] vpbroadcastd xmm2 {k1}, [r1+2*t0] vpbroadcastd xmm4 {k1}, [r1+t1] vpbroadcastd xmm2 {k2}, [r2+t0] vpbroadcastd xmm4 {k2}, [r2] vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3 vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2 vpmovqd s1, m6 ; s0 s2 s1 s3 movd xmm3, [r3] movd xmm5, [r3+t0] vpbroadcastd xmm3 {k1}, [r3+2*t0] vpbroadcastd xmm5 {k1}, [r3+t1] %if %1 == 4 vpbroadcastd xmm3 {k2}, [r4+t0] vpbroadcastd xmm5 {k2}, [r4] vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3 vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2 %endif %if %%i != %2/4-1 %assign %%j 1 %rep %1 lea r%+%%j, [r%+%%j+4*t0] %assign %%j %%j+1 %endrep %endif pshufd s2, s1, q1032 psadbw xmm2, s1 psadbw xmm4, s2 psadbw xmm3, s1 psadbw xmm5, s2 %if %%i paddd xmm0, xmm2 paddd xmm1, xmm3 paddd xmm0, xmm4 paddd 
xmm1, xmm5 %else paddd xmm0, xmm2, xmm4 paddd xmm1, xmm3, xmm5 %endif %assign %%i %%i+1 %endrep %if %1 == 4 movifnidn t2, r6mp %else movifnidn t2, r5mp %endif packusdw xmm0, xmm1 mova [t2], xmm0 RET %endmacro %macro SAD_X_W8_AVX512 2 ; x, h cglobal pixel_sad_x%1_8x%2, %1+2,%1+3 kxnorb k3, k3, k3 lea t1, [3*t0] kaddb k1, k3, k3 kshiftlb k2, k3, 2 kshiftlb k3, k3, 3 %assign %%i 0 %rep %2/4 movddup m6, [r0+%%i*64] ; s0 s0 s1 s1 movq xm2, [r1] movq xm4, [r1+2*t0] vpbroadcastq xm2 {k1}, [r2] vpbroadcastq xm4 {k1}, [r2+2*t0] vpbroadcastq m2 {k2}, [r1+t0] vpbroadcastq m4 {k2}, [r1+t1] vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1 vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3 movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3 movq xm3, [r3] movq xm5, [r3+2*t0] %if %1 == 4 vpbroadcastq xm3 {k1}, [r4] vpbroadcastq xm5 {k1}, [r4+2*t0] %endif vpbroadcastq m3 {k2}, [r3+t0] vpbroadcastq m5 {k2}, [r3+t1] %if %1 == 4 vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1 vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3 %endif %if %%i != %2/4-1 %assign %%j 1 %rep %1 lea r%+%%j, [r%+%%j+4*t0] %assign %%j %%j+1 %endrep %endif psadbw m2, m6 psadbw m4, m7 psadbw m3, m6 psadbw m5, m7 ACCUM paddd, 0, 2, %%i ACCUM paddd, 1, 3, %%i paddd m0, m4 paddd m1, m5 %assign %%i %%i+1 %endrep %if %1 == 4 movifnidn t2, r6mp %else movifnidn t2, r5mp %endif packusdw m0, m1 vextracti128 xm1, m0, 1 paddd xm0, xm1 mova [t2], xm0 RET %endmacro %macro SAD_X_W16_AVX512 2 ; x, h cglobal pixel_sad_x%1_16x%2, %1+2,%1+3 lea t1, [3*t0] %assign %%i 0 %rep %2/4 mova m6, [r0+%%i*64] ; s0 s1 s2 s3 movu xm2, [r3] movu xm4, [r3+t0] %if %1 == 4 vinserti128 ym2, [r4+t0], 1 vinserti128 ym4, [r4], 1 %endif vinserti32x4 m2, [r1+2*t0], 2 vinserti32x4 m4, [r1+t1], 2 vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3 vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2 vpermq m7, m6, q1032 ; s1 s0 s3 s2 movu xm3, [r1] movu xm5, [r1+t0] vinserti128 ym3, [r2+t0], 1 vinserti128 ym5, [r2], 1 vinserti32x4 m3, [r3+2*t0], 2 vinserti32x4 m5, [r3+t1], 2 %if %1 == 4 vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3 vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2 %endif %if %%i != %2/4-1 %assign %%j 1 %rep %1 lea r%+%%j, [r%+%%j+4*t0] %assign %%j %%j+1 %endrep %endif psadbw m2, m6 psadbw m4, m7 psadbw m3, m6 psadbw m5, m7 ACCUM paddd, 0, 2, %%i ACCUM paddd, 1, 3, %%i paddd m0, m4 paddd m1, m5 %assign %%i %%i+1 %endrep %if %1 == 4 movifnidn t2, r6mp %else movifnidn t2, r5mp %endif mov t1d, 0x1111 kmovw k1, t1d vshufi32x4 m0, m0, q1032 paddd m0, m1 punpckhqdq m1, m0, m0 paddd m0, m1 vpcompressd m0 {k1}{z}, m0 mova [t2], xm0 RET %endmacro ; t0 = stride, t1 = tmp/stride3, t2 = scores %if WIN64 %define s1 xmm16 ; xmm6 and xmm7 reduces code size, but %define s2 xmm17 ; they're callee-saved on win64 DECLARE_REG_TMP 4, 6, 0 %else %define s1 xmm6 %define s2 xmm7 %if ARCH_X86_64 DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64 %else DECLARE_REG_TMP 4, 5, 0 %endif %endif INIT_YMM avx512 SAD_X_W4_AVX512 3, 4 ; x3_4x4 SAD_X_W4_AVX512 3, 8 ; x3_4x8 SAD_X_W8_AVX512 3, 4 ; x3_8x4 SAD_X_W8_AVX512 3, 8 ; x3_8x8 SAD_X_W8_AVX512 3, 16 ; x3_8x16 INIT_ZMM avx512 SAD_X_W16_AVX512 3, 8 ; x3_16x8 SAD_X_W16_AVX512 3, 16 ; x3_16x16 DECLARE_REG_TMP 5, 6, 0 INIT_YMM avx512 SAD_X_W4_AVX512 4, 4 ; x4_4x4 SAD_X_W4_AVX512 4, 8 ; x4_4x8 SAD_X_W8_AVX512 4, 4 ; x4_8x4 SAD_X_W8_AVX512 4, 8 ; x4_8x8 SAD_X_W8_AVX512 4, 16 ; x4_8x16 INIT_ZMM avx512 SAD_X_W16_AVX512 4, 8 ; x4_16x8 SAD_X_W16_AVX512 4, 16 ; x4_16x16 ;============================================================================= ; SAD cacheline split 
;============================================================================= ; Core2 (Conroe) can load unaligned data just as quickly as aligned data... ; unless the unaligned data spans the border between 2 cachelines, in which ; case it's really slow. The exact numbers may differ, but all Intel cpus prior ; to Nehalem have a large penalty for cacheline splits. ; (8-byte alignment exactly half way between two cachelines is ok though.) ; LDDQU was supposed to fix this, but it only works on Pentium 4. ; So in the split case we load aligned data and explicitly perform the ; alignment between registers. Like on archs that have only aligned loads, ; except complicated by the fact that PALIGNR takes only an immediate, not ; a variable alignment. ; It is also possible to hoist the realignment to the macroblock level (keep ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory ; needed for that method makes it often slower. ; sad 16x16 costs on Core2: ; good offsets: 49 cycles (50/64 of all mvs) ; cacheline split: 234 cycles (14/64 of all mvs. ammortized: +40 cycles) ; page split: 3600 cycles (14/4096 of all mvs. ammortized: +11.5 cycles) ; cache or page split with palignr: 57 cycles (ammortized: +2 cycles) ; computed jump assumes this loop is exactly 80 bytes %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment ALIGN 16 sad_w16_align%1_sse2: movdqa xmm1, [r2+16] movdqa xmm2, [r2+r3+16] movdqa xmm3, [r2] movdqa xmm4, [r2+r3] pslldq xmm1, 16-%1 pslldq xmm2, 16-%1 psrldq xmm3, %1 psrldq xmm4, %1 por xmm1, xmm3 por xmm2, xmm4 psadbw xmm1, [r0] psadbw xmm2, [r0+r1] paddw xmm0, xmm1 paddw xmm0, xmm2 lea r0, [r0+2*r1] lea r2, [r2+2*r3] dec r4 jg sad_w16_align%1_sse2 ret %endmacro ; computed jump assumes this loop is exactly 64 bytes %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment ALIGN 16 sad_w16_align%1_ssse3: movdqa xmm1, [r2+16] movdqa xmm2, [r2+r3+16] palignr xmm1, [r2], %1 palignr xmm2, [r2+r3], %1 psadbw xmm1, [r0] psadbw xmm2, [r0+r1] paddw xmm0, xmm1 paddw xmm0, xmm2 lea r0, [r0+2*r1] lea r2, [r2+2*r3] dec r4 jg sad_w16_align%1_ssse3 ret %endmacro %macro SAD16_CACHELINE_FUNC 2 ; cpu, height cglobal pixel_sad_16x%2_cache64_%1 mov eax, r2m and eax, 0x37 cmp eax, 0x30 jle pixel_sad_16x%2_sse2 PROLOGUE 4,6 mov r4d, r2d and r4d, 15 %ifidn %1, ssse3 shl r4d, 6 ; code size = 64 %else lea r4, [r4*5] shl r4d, 4 ; code size = 80 %endif %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1)) %if ARCH_X86_64 lea r5, [sad_w16_addr] add r5, r4 %else lea r5, [sad_w16_addr + r4] %endif and r2, ~15 mov r4d, %2/2 pxor xmm0, xmm0 call r5 MOVHL xmm1, xmm0 paddw xmm0, xmm1 movd eax, xmm0 RET %endmacro %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline mov eax, r2m and eax, 0x17|%1|(%4>>1) cmp eax, 0x10|%1|(%4>>1) jle pixel_sad_%1x%2_mmx2 and eax, 7 shl eax, 3 movd mm6, [sw_64] movd mm7, eax psubw mm6, mm7 PROLOGUE 4,5 and r2, ~7 mov r4d, %3 pxor mm0, mm0 %endmacro %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline cglobal pixel_sad_16x%1_cache%2_mmx2 SAD_CACHELINE_START_MMX2 16, %1, %1, %2 .loop: movq mm1, [r2] movq mm2, [r2+8] movq mm3, [r2+16] movq mm4, mm2 psrlq mm1, mm7 psllq mm2, mm6 psllq mm3, mm6 psrlq mm4, mm7 por mm1, mm2 por mm3, mm4 psadbw mm1, [r0] psadbw mm3, [r0+8] paddw mm0, mm1 paddw mm0, mm3 add r2, r3 add r0, r1 dec r4 jg .loop movd eax, mm0 RET %endmacro %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline cglobal pixel_sad_8x%1_cache%2_mmx2 SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 .loop: movq mm1, [r2+8] movq mm2, [r2+r3+8] 
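; Rebuild each misaligned 8-byte reference row from two aligned loads: the low
; qword is shifted right by mm7 (8*(offset&7) bits), the high qword left by
; mm6 (64 minus that), and the halves are OR'd together before psadbw.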
movq mm3, [r2] movq mm4, [r2+r3] psllq mm1, mm6 psllq mm2, mm6 psrlq mm3, mm7 psrlq mm4, mm7 por mm1, mm3 por mm2, mm4 psadbw mm1, [r0] psadbw mm2, [r0+r1] paddw mm0, mm1 paddw mm0, mm2 lea r2, [r2+2*r3] lea r0, [r0+2*r1] dec r4 jg .loop movd eax, mm0 RET %endmacro ; sad_x3/x4_cache64: check each mv. ; if they're all within a cacheline, use normal sad_x3/x4. ; otherwise, send them individually to sad_cache64. %macro CHECK_SPLIT 3 ; pix, width, cacheline mov eax, %1 and eax, 0x17|%2|(%3>>1) cmp eax, 0x10|%2|(%3>>1) jg .split %endmacro %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name cglobal pixel_sad_x3_%1x%2_cache%3_%6 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 jmp pixel_sad_x3_%1x%2_%4 .split: %if ARCH_X86_64 PROLOGUE 6,9 push r3 push r2 %if WIN64 movsxd r4, r4d sub rsp, 40 ; shadow space and alignment %endif mov r2, r1 mov r1, FENC_STRIDE mov r3, r4 mov r7, r0 mov r8, r5 call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 mov r2, [rsp+40+0*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 mov r2, [rsp+40+1*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 add rsp, 40+2*8 %endif RET %else push edi mov edi, [esp+28] push dword [esp+24] push dword [esp+16] push dword 16 push dword [esp+20] call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+32] mov [edi], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+36] mov [edi+4], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov [edi+8], eax add esp, 16 pop edi ret %endif %endmacro %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name cglobal pixel_sad_x4_%1x%2_cache%3_%6 CHECK_SPLIT r1m, %1, %3 CHECK_SPLIT r2m, %1, %3 CHECK_SPLIT r3m, %1, %3 CHECK_SPLIT r4m, %1, %3 jmp pixel_sad_x4_%1x%2_%4 .split: %if ARCH_X86_64 PROLOGUE 6,9 mov r8, r6mp push r4 push r3 push r2 %if WIN64 sub rsp, 32 ; shadow space %endif mov r2, r1 mov r1, FENC_STRIDE mov r3, r5 mov r7, r0 call pixel_sad_%1x%2_cache%3_%5 mov [r8], eax %if WIN64 mov r2, [rsp+32+0*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+4], eax %if WIN64 mov r2, [rsp+32+1*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+8], eax %if WIN64 mov r2, [rsp+32+2*8] %else pop r2 %endif mov r0, r7 call pixel_sad_%1x%2_cache%3_%5 mov [r8+12], eax %if WIN64 add rsp, 32+3*8 %endif RET %else push edi mov edi, [esp+32] push dword [esp+28] push dword [esp+16] push dword 16 push dword [esp+20] call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+32] mov [edi], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+36] mov [edi+4], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov ecx, [esp+40] mov [edi+8], eax mov [esp+8], ecx call pixel_sad_%1x%2_cache%3_%5 mov [edi+12], eax add esp, 16 pop edi ret %endif %endmacro %macro SADX34_CACHELINE_FUNC 1+ SADX3_CACHELINE_FUNC %1 SADX4_CACHELINE_FUNC %1 %endmacro ; instantiate the aligned sads INIT_MMX %if ARCH_X86_64 == 0 SAD16_CACHELINE_FUNC_MMX2 8, 32 SAD16_CACHELINE_FUNC_MMX2 16, 32 SAD8_CACHELINE_FUNC_MMX2 4, 32 SAD8_CACHELINE_FUNC_MMX2 8, 32 SAD8_CACHELINE_FUNC_MMX2 16, 32 SAD16_CACHELINE_FUNC_MMX2 8, 64 SAD16_CACHELINE_FUNC_MMX2 16, 64 %endif ; !ARCH_X86_64 SAD8_CACHELINE_FUNC_MMX2 4, 64 SAD8_CACHELINE_FUNC_MMX2 8, 64 SAD8_CACHELINE_FUNC_MMX2 16, 64 %if ARCH_X86_64 == 0 SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, 
mmx2 SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2 %endif ; !ARCH_X86_64 SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2 SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2 %if ARCH_X86_64 == 0 SAD16_CACHELINE_FUNC sse2, 8 SAD16_CACHELINE_FUNC sse2, 16 %assign i 1 %rep 15 SAD16_CACHELINE_LOOP_SSE2 i %assign i i+1 %endrep SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2 %endif ; !ARCH_X86_64 SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2 SAD16_CACHELINE_FUNC ssse3, 8 SAD16_CACHELINE_FUNC ssse3, 16 %assign i 1 %rep 15 SAD16_CACHELINE_LOOP_SSSE3 i %assign i i+1 %endrep SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3 x264-master/common/x86/sad16-a.asm000066400000000000000000000410501502133446700166320ustar00rootroot00000000000000;***************************************************************************** ;* sad16-a.asm: x86 high depth sad functions ;***************************************************************************** ;* Copyright (C) 2010-2025 x264 project ;* ;* Authors: Oskar Arvidsson ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** %include "x86inc.asm" %include "x86util.asm" SECTION .text cextern pw_1 cextern pw_4 cextern pw_8 ;============================================================================= ; SAD MMX ;============================================================================= %macro SAD_INC_1x16P_MMX 0 movu m1, [r0+ 0] movu m2, [r0+ 8] movu m3, [r0+16] movu m4, [r0+24] psubw m1, [r2+ 0] psubw m2, [r2+ 8] psubw m3, [r2+16] psubw m4, [r2+24] ABSW2 m1, m2, m1, m2, m5, m6 ABSW2 m3, m4, m3, m4, m7, m5 lea r0, [r0+2*r1] lea r2, [r2+2*r3] paddw m1, m2 paddw m3, m4 paddw m0, m1 paddw m0, m3 %endmacro %macro SAD_INC_2x8P_MMX 0 movu m1, [r0+0] movu m2, [r0+8] movu m3, [r0+2*r1+0] movu m4, [r0+2*r1+8] psubw m1, [r2+0] psubw m2, [r2+8] psubw m3, [r2+2*r3+0] psubw m4, [r2+2*r3+8] ABSW2 m1, m2, m1, m2, m5, m6 ABSW2 m3, m4, m3, m4, m7, m5 lea r0, [r0+4*r1] lea r2, [r2+4*r3] paddw m1, m2 paddw m3, m4 paddw m0, m1 paddw m0, m3 %endmacro %macro SAD_INC_2x4P_MMX 0 movu m1, [r0] movu m2, [r0+2*r1] psubw m1, [r2] psubw m2, [r2+2*r3] ABSW2 m1, m2, m1, m2, m3, m4 lea r0, [r0+4*r1] lea r2, [r2+4*r3] paddw m0, m1 paddw m0, m2 %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD_MMX 3 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4) pxor m0, m0 %if %2 == 4 SAD_INC_%3x%1P_MMX SAD_INC_%3x%1P_MMX %else mov r4d, %2/%3 .loop: SAD_INC_%3x%1P_MMX dec r4d jg .loop %endif %if %1*%2 == 256 HADDUW m0, m1 %else HADDW m0, m1 %endif movd eax, m0 RET %endmacro INIT_MMX mmx2 SAD_MMX 16, 16, 1 SAD_MMX 16, 8, 1 SAD_MMX 8, 16, 2 SAD_MMX 8, 8, 2 SAD_MMX 8, 4, 2 SAD_MMX 4, 8, 2 SAD_MMX 4, 4, 2 INIT_MMX ssse3 SAD_MMX 4, 8, 2 SAD_MMX 4, 4, 2 ;============================================================================= ; SAD XMM ;============================================================================= %macro SAD_INC_2ROW 1 %if 2*%1 > mmsize movu m1, [r2+ 0] movu m2, [r2+16] movu m3, [r2+2*r3+ 0] movu m4, [r2+2*r3+16] psubw m1, [r0+ 0] psubw m2, [r0+16] psubw m3, [r0+2*r1+ 0] psubw m4, [r0+2*r1+16] ABSW2 m1, m2, m1, m2, m5, m6 lea r0, [r0+4*r1] lea r2, [r2+4*r3] ABSW2 m3, m4, m3, m4, m7, m5 paddw m1, m2 paddw m3, m4 paddw m0, m1 paddw m0, m3 %else movu m1, [r2] movu m2, [r2+2*r3] psubw m1, [r0] psubw m2, [r0+2*r1] ABSW2 m1, m2, m1, m2, m3, m4 lea r0, [r0+4*r1] lea r2, [r2+4*r3] paddw m0, m1 paddw m0, m2 %endif %endmacro ;----------------------------------------------------------------------------- ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;----------------------------------------------------------------------------- %macro SAD 2 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize) pxor m0, m0 %if %2 == 4 SAD_INC_2ROW %1 SAD_INC_2ROW %1 %else mov r4d, %2/2 .loop: SAD_INC_2ROW %1 dec r4d jg .loop %endif HADDW m0, m1 movd eax, xm0 RET %endmacro INIT_XMM sse2 SAD 16, 16 SAD 16, 8 SAD 8, 16 SAD 8, 8 SAD 8, 4 INIT_XMM sse2, aligned SAD 16, 16 SAD 16, 8 SAD 8, 16 SAD 8, 8 INIT_XMM ssse3 SAD 16, 16 SAD 16, 8 SAD 8, 16 SAD 8, 8 SAD 8, 4 INIT_XMM ssse3, aligned SAD 16, 16 SAD 16, 8 SAD 8, 16 SAD 8, 8 INIT_YMM avx2 SAD 16, 16 SAD 16, 8 ;============================================================================= ; SAD x3/x4 ;============================================================================= %macro SAD_X3_INC_P 0 add r0, 4*FENC_STRIDE lea r1, [r1+4*r4] lea r2, 
[r2+4*r4] lea r3, [r3+4*r4] %endmacro %macro SAD_X3_ONE_START 0 mova m3, [r0] movu m0, [r1] movu m1, [r2] movu m2, [r3] psubw m0, m3 psubw m1, m3 psubw m2, m3 ABSW2 m0, m1, m0, m1, m4, m5 ABSW m2, m2, m6 %endmacro %macro SAD_X3_ONE 2 mova m6, [r0+%1] movu m3, [r1+%2] movu m4, [r2+%2] movu m5, [r3+%2] psubw m3, m6 psubw m4, m6 psubw m5, m6 ABSW2 m3, m4, m3, m4, m7, m6 ABSW m5, m5, m6 paddw m0, m3 paddw m1, m4 paddw m2, m5 %endmacro %macro SAD_X3_END 2 %if mmsize == 8 && %1*%2 == 256 HADDUW m0, m3 HADDUW m1, m4 HADDUW m2, m5 %else HADDW m0, m3 HADDW m1, m4 HADDW m2, m5 %endif %if UNIX64 movd [r5+0], xm0 movd [r5+4], xm1 movd [r5+8], xm2 %else mov r0, r5mp movd [r0+0], xm0 movd [r0+4], xm1 movd [r0+8], xm2 %endif RET %endmacro %macro SAD_X4_INC_P 0 add r0, 4*FENC_STRIDE lea r1, [r1+4*r5] lea r2, [r2+4*r5] lea r3, [r3+4*r5] lea r4, [r4+4*r5] %endmacro %macro SAD_X4_ONE_START 0 mova m4, [r0] movu m0, [r1] movu m1, [r2] movu m2, [r3] movu m3, [r4] psubw m0, m4 psubw m1, m4 psubw m2, m4 psubw m3, m4 ABSW2 m0, m1, m0, m1, m5, m6 ABSW2 m2, m3, m2, m3, m4, m7 %endmacro %macro SAD_X4_ONE 2 mova m4, [r0+%1] movu m5, [r1+%2] movu m6, [r2+%2] %if num_mmregs > 8 movu m7, [r3+%2] movu m8, [r4+%2] psubw m5, m4 psubw m6, m4 psubw m7, m4 psubw m8, m4 ABSW2 m5, m6, m5, m6, m9, m10 ABSW2 m7, m8, m7, m8, m9, m10 paddw m0, m5 paddw m1, m6 paddw m2, m7 paddw m3, m8 %elif cpuflag(ssse3) movu m7, [r3+%2] psubw m5, m4 psubw m6, m4 psubw m7, m4 movu m4, [r4+%2] pabsw m5, m5 psubw m4, [r0+%1] pabsw m6, m6 pabsw m7, m7 pabsw m4, m4 paddw m0, m5 paddw m1, m6 paddw m2, m7 paddw m3, m4 %else ; num_mmregs == 8 && !ssse3 psubw m5, m4 psubw m6, m4 ABSW m5, m5, m7 ABSW m6, m6, m7 paddw m0, m5 paddw m1, m6 movu m5, [r3+%2] movu m6, [r4+%2] psubw m5, m4 psubw m6, m4 ABSW2 m5, m6, m5, m6, m7, m4 paddw m2, m5 paddw m3, m6 %endif %endmacro %macro SAD_X4_END 2 %if mmsize == 8 && %1*%2 == 256 HADDUW m0, m4 HADDUW m1, m5 HADDUW m2, m6 HADDUW m3, m7 %else HADDW m0, m4 HADDW m1, m5 HADDW m2, m6 HADDW m3, m7 %endif mov r0, r6mp movd [r0+ 0], xm0 movd [r0+ 4], xm1 movd [r0+ 8], xm2 movd [r0+12], xm3 RET %endmacro %macro SAD_X_2xNP 4 %assign x %3 %rep %4 SAD_X%1_ONE x*mmsize, x*mmsize SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize %assign x x+1 %endrep %endmacro %macro PIXEL_VSAD 0 cglobal pixel_vsad, 3,3,8 mova m0, [r0] mova m1, [r0+16] mova m2, [r0+2*r1] mova m3, [r0+2*r1+16] lea r0, [r0+4*r1] psubw m0, m2 psubw m1, m3 ABSW2 m0, m1, m0, m1, m4, m5 paddw m0, m1 sub r2d, 2 je .end .loop: mova m4, [r0] mova m5, [r0+16] mova m6, [r0+2*r1] mova m7, [r0+2*r1+16] lea r0, [r0+4*r1] psubw m2, m4 psubw m3, m5 psubw m4, m6 psubw m5, m7 ABSW m2, m2, m1 ABSW m3, m3, m1 ABSW m4, m4, m1 ABSW m5, m5, m1 paddw m0, m2 paddw m0, m3 paddw m0, m4 paddw m0, m5 mova m2, m6 mova m3, m7 sub r2d, 2 jg .loop .end: %if BIT_DEPTH == 9 HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682 %else HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426 %endif movd eax, m0 RET %endmacro INIT_XMM sse2 PIXEL_VSAD INIT_XMM ssse3 PIXEL_VSAD INIT_XMM xop PIXEL_VSAD INIT_YMM avx2 cglobal pixel_vsad, 3,3 mova m0, [r0] mova m1, [r0+2*r1] lea r0, [r0+4*r1] psubw m0, m1 pabsw m0, m0 sub r2d, 2 je .end .loop: mova m2, [r0] mova m3, [r0+2*r1] lea r0, [r0+4*r1] psubw m1, m2 psubw m2, m3 pabsw m1, m1 pabsw m2, m2 paddw m0, m1 paddw m0, m2 mova m1, m3 sub r2d, 2 jg .loop .end: %if BIT_DEPTH == 9 HADDW m0, m1 %else HADDUW m0, m1 %endif movd eax, xm0 RET ;----------------------------------------------------------------------------- ; void pixel_sad_xN_WxH( uint16_t 
*fenc, uint16_t *pix0, uint16_t *pix1, ; uint16_t *pix2, intptr_t i_stride, int scores[3] ) ;----------------------------------------------------------------------------- %macro SAD_X 3 cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS %assign regnum %1+1 %xdefine STRIDE r %+ regnum mov r6, %3/2-1 SAD_X%1_ONE_START SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1 .loop: SAD_X%1_INC_P SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2) dec r6 jg .loop %if %1 == 4 mov r6, r6m %endif SAD_X%1_END %2, %3 %endmacro INIT_MMX mmx2 %define XMM_REGS 0 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 SAD_X 3, 4, 8 SAD_X 3, 4, 4 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 SAD_X 4, 4, 8 SAD_X 4, 4, 4 INIT_MMX ssse3 SAD_X 3, 4, 8 SAD_X 3, 4, 4 SAD_X 4, 4, 8 SAD_X 4, 4, 4 INIT_XMM ssse3 %define XMM_REGS 7 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 %define XMM_REGS 9 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 INIT_XMM sse2 %define XMM_REGS 8 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 %define XMM_REGS 11 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 INIT_XMM xop %define XMM_REGS 7 SAD_X 3, 16, 16 SAD_X 3, 16, 8 SAD_X 3, 8, 16 SAD_X 3, 8, 8 SAD_X 3, 8, 4 %define XMM_REGS 9 SAD_X 4, 16, 16 SAD_X 4, 16, 8 SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 INIT_YMM avx2 %define XMM_REGS 7 SAD_X 3, 16, 16 SAD_X 3, 16, 8 %define XMM_REGS 9 SAD_X 4, 16, 16 SAD_X 4, 16, 8 ;----------------------------------------------------------------------------- ; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] ); ;----------------------------------------------------------------------------- %macro INTRA_SAD_X3_4x4 0 cglobal intra_sad_x3_4x4, 3,3,7 %if cpuflag(ssse3) movddup m0, [r1-1*FDEC_STRIDEB] %else movq m0, [r1-1*FDEC_STRIDEB] punpcklqdq m0, m0 %endif movq m1, [r0+0*FENC_STRIDEB] movq m2, [r0+2*FENC_STRIDEB] pshuflw m6, m0, q1032 paddw m6, m0 pshuflw m5, m6, q2301 paddw m6, m5 punpcklqdq m6, m6 ; A+B+C+D 8 times movhps m1, [r0+1*FENC_STRIDEB] movhps m2, [r0+3*FENC_STRIDEB] psubw m3, m1, m0 psubw m0, m2 ABSW2 m3, m0, m3, m0, m4, m5 paddw m0, m3 movd m3, [r1+0*FDEC_STRIDEB-4] movd m4, [r1+2*FDEC_STRIDEB-4] movhps m3, [r1+1*FDEC_STRIDEB-8] movhps m4, [r1+3*FDEC_STRIDEB-8] pshufhw m3, m3, q3333 pshufhw m4, m4, q3333 pshuflw m3, m3, q1111 ; FF FF EE EE pshuflw m4, m4, q1111 ; HH HH GG GG paddw m5, m3, m4 paddw m6, [pw_4] paddw m6, m5 pshufd m5, m5, q1032 paddw m5, m6 psrlw m5, 3 psubw m6, m5, m2 psubw m5, m1 psubw m1, m3 psubw m2, m4 ABSW2 m5, m6, m5, m6, m3, m4 ABSW2 m1, m2, m1, m2, m3, m4 paddw m5, m6 paddw m1, m2 %if cpuflag(ssse3) phaddw m0, m1 movhlps m3, m5 paddw m5, m3 phaddw m0, m5 pmaddwd m0, [pw_1] mova [r2], m0 %else HADDW m0, m3 HADDW m1, m3 HADDW m5, m3 movd [r2], m0 ; V prediction cost movd [r2+4], m1 ; H prediction cost movd [r2+8], m5 ; DC prediction cost %endif RET %endmacro INIT_XMM sse2 INTRA_SAD_X3_4x4 INIT_XMM ssse3 INTRA_SAD_X3_4x4 INIT_XMM avx INTRA_SAD_X3_4x4 ;----------------------------------------------------------------------------- ; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] ); ;----------------------------------------------------------------------------- ;m0 = DC ;m6 = V ;m7 = H ;m1 = DC score ;m2 = V score ;m3 = H score ;m5 = temp ;m4 = pixel row %macro INTRA_SAD_HVDC_ITER 2 mova m4, [r0+(%1-4)*FENC_STRIDEB] psubw m4, m0 ABSW m4, m4, m5 ACCUM paddw, 1, 4, %1 mova m4, 
[r0+(%1-4)*FENC_STRIDEB] psubw m4, m6 ABSW m4, m4, m5 ACCUM paddw, 2, 4, %1 pshufd m5, m7, %2 psubw m5, [r0+(%1-4)*FENC_STRIDEB] ABSW m5, m5, m4 ACCUM paddw, 3, 5, %1 %endmacro %macro INTRA_SAD_X3_8x8 0 cglobal intra_sad_x3_8x8, 3,3,8 add r0, 4*FENC_STRIDEB movu m0, [r1+7*SIZEOF_PIXEL] mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction mova m7, m0 paddw m0, m6 punpckhwd m7, m7 HADDW m0, m4 paddw m0, [pw_8] psrlw m0, 4 SPLATW m0, m0 INTRA_SAD_HVDC_ITER 0, q3333 INTRA_SAD_HVDC_ITER 1, q2222 INTRA_SAD_HVDC_ITER 2, q1111 INTRA_SAD_HVDC_ITER 3, q0000 movq m7, [r1+7*SIZEOF_PIXEL] punpcklwd m7, m7 INTRA_SAD_HVDC_ITER 4, q3333 INTRA_SAD_HVDC_ITER 5, q2222 INTRA_SAD_HVDC_ITER 6, q1111 INTRA_SAD_HVDC_ITER 7, q0000 %if cpuflag(ssse3) phaddw m2, m3 ; 2 2 2 2 3 3 3 3 movhlps m3, m1 paddw m1, m3 ; 1 1 1 1 _ _ _ _ phaddw m2, m1 ; 2 2 3 3 1 1 _ _ pmaddwd m2, [pw_1] ; 2 3 1 _ mova [r2], m2 %else HADDW m2, m4 HADDW m3, m4 HADDW m1, m4 movd [r2+0], m2 movd [r2+4], m3 movd [r2+8], m1 %endif RET %endmacro INIT_XMM sse2 INTRA_SAD_X3_8x8 INIT_XMM ssse3 INTRA_SAD_X3_8x8 %macro INTRA_SAD_HVDC_ITER_YMM 2 mova xm4, [r0+(%1-4)*FENC_STRIDEB] vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1 pshufd m5, m7, %2 psubw m5, m4 pabsw m5, m5 ACCUM paddw, 2, 5, %1 ; H psubw m5, m4, m6 psubw m4, m0 pabsw m5, m5 pabsw m4, m4 ACCUM paddw, 1, 5, %1 ; V ACCUM paddw, 3, 4, %1 ; DC %endmacro INIT_YMM avx2 cglobal intra_sad_x3_8x8, 3,3,8 add r0, 4*FENC_STRIDEB movu xm0, [r1+7*SIZEOF_PIXEL] vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction vpermq m7, m0, q0011 paddw xm0, xm6 paddw xm0, [pw_1] ; equal to +8 after HADDW HADDW xm0, xm4 psrld xm0, 4 vpbroadcastw m0, xm0 punpcklwd m7, m7 INTRA_SAD_HVDC_ITER_YMM 0, q3333 INTRA_SAD_HVDC_ITER_YMM 1, q2222 INTRA_SAD_HVDC_ITER_YMM 2, q1111 INTRA_SAD_HVDC_ITER_YMM 3, q0000 phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 punpckhqdq m2, m3, m3 paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _ phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _ vextracti128 xm2, m1, 1 paddw xm1, xm2 ; 1 1 2 2 3 3 _ _ pmaddwd xm1, [pw_1] ; 1 2 3 _ mova [r2], xm1 RET x264-master/common/x86/trellis-64.asm000066400000000000000000000622441502133446700174130ustar00rootroot00000000000000;***************************************************************************** ;* trellis-64.asm: x86_64 trellis quantization ;***************************************************************************** ;* Copyright (C) 2012-2025 x264 project ;* ;* Authors: Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. ;***************************************************************************** ; This is a pretty straight-forward translation of the C code, except: ; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level. 
; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop: ; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those ; nodes are invalid). ; * Interprocedural register allocation. Eliminates argument-passing overhead ; to trellis_coef* subroutines. Also reduces codesize. ; Optimizations that I tried, and rejected because they were not faster: ; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3]. ; Costs too much icache compared to the negligible speedup. ; * There are only 21 possible sets of live node_ctxs; we could keep track of ; exactly which set we're in and feed that (along with abs_level) into a jump ; table instead of the switch to select a trellis_coef subroutine. This would ; eliminate all branches about which node_ctxs are live, but costs either a ; bunch of icache or a bunch of call/ret, and the jump table itself is ; unpredictable. ; * Separate versions of trellis_coef* depending on whether we're doing the 1st ; or the 2nd of the two abs_level candidates. This would eliminate some ; branches about if(score is better). ; * Special case more values of coef. I had a coef2 at some intermediate point ; in the optimization process, but it didn't end up worthwhile in conjunction ; with all the other optimizations. ; * Unroll or simd writeback. I don't know why this didn't help. %include "x86inc.asm" %include "x86util.asm" SECTION_RODATA pd_m16: times 4 dd -16 sq_1: dq 1, 0 pq_128: times 2 dq 128 pq_ffffffff: times 2 dq 0xffffffff cextern pd_8 cextern pd_0123 cextern pd_4567 cextern_common cabac_entropy cextern_common cabac_transition cextern cabac_size_unary cextern cabac_transition_unary cextern_common dct4_weight_tab cextern_common dct8_weight_tab cextern_common dct4_weight2_tab cextern_common dct8_weight2_tab cextern_common last_coeff_flag_offset_8x8 cextern_common significant_coeff_flag_offset_8x8 cextern_common coeff_flag_offset_chroma_422_dc SECTION .text %define TRELLIS_SCORE_BIAS 1<<60 %define SIZEOF_NODE 16 %define CABAC_SIZE_BITS 8 %define LAMBDA_BITS 4 %macro SQUARE 2 ; dst, tmp ; could use pmuldq here, to eliminate the abs. but that would involve ; templating a sse4 version of all of trellis, for negligible speedup. 
%if cpuflag(ssse3) pabsd m%1, m%1 pmuludq m%1, m%1 %elif HIGH_BIT_DEPTH ABSD m%2, m%1 SWAP %1, %2 pmuludq m%1, m%1 %else pmuludq m%1, m%1 pand m%1, [pq_ffffffff] %endif %endmacro %macro LOAD_DUP 2 ; dst, src %if cpuflag(ssse3) movddup %1, %2 %else movd %1, %2 punpcklqdq %1, %1 %endif %endmacro ;----------------------------------------------------------------------------- ; int trellis_cabac_4x4_psy( ; const int *unquant_mf, const uint8_t *zigzag, int lambda2, ; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct, ; uint8_t *cabac_state_sig, uint8_t *cabac_state_last, ; uint64_t level_state0, uint16_t level_state1, ; int b_ac, dctcoef *fenc_dct, int psy_trellis ) ;----------------------------------------------------------------------------- %macro TRELLIS 4 %define num_coefs %2 %define dc %3 %define psy %4 cglobal %1, 4,15,9 %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef* %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15) SUB rsp, pad DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last %if WIN64 %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array %else %define level_statem rsp+stack_offset+32 %endif %define b_acm r11m ; 4x4 only %define b_interlacedm r11m ; 8x8 only %define i_coefsm1 r11m ; dc only %define fenc_dctm r12m %define psy_trellism r13m %if num_coefs == 64 shl dword b_interlacedm, 6 %define dct_weight1_tab dct8_weight_tab %define dct_weight2_tab dct8_weight2_tab %else %define dct_weight1_tab dct4_weight_tab %define dct_weight2_tab dct4_weight2_tab %endif %define stack rsp %define last_nnzm [stack+0] %define zigzagm [stack+8] mov last_nnzm, iid mov zigzagm, zigzagq %if WIN64 == 0 %define orig_coefsm [stack+16] %define quant_coefsm [stack+24] mov orig_coefsm, orig_coefsq mov quant_coefsm, quant_coefsq %endif %define unquant_mfm [stack+32] %define levelgt1_ctxm [stack+40] %define ssd stack+48 %define cost_siglast stack+80 %define level_tree stack+96 ; trellis_node_t is laid out differently than C. ; struct-of-arrays rather than array-of-structs, for simd. %define nodes_curq r7 %define nodes_prevq r8 %define node_score(x) x*8 %define node_level_idx(x) 64+x*4 %define node_cabac_state(x) 96+x*4 lea nodes_curq, [level_tree + level_tree_size] lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE] mov r6, TRELLIS_SCORE_BIAS mov [nodes_curq + node_score(0)], r6 mov dword [nodes_curq + node_level_idx(0)], 0 movd mm0, [level_statem + 0] punpcklbw mm0, [level_statem + 4] punpcklwd mm0, [level_statem + 8] %define level_state_packed mm0 ; version for copying into node.cabac_state pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX movq [nodes_curq + node_score(1)], m7 mova [nodes_curq + node_score(2)], m7 %define levels_usedq r4 %define levels_usedd r4d mov dword [level_tree], 0 mov levels_usedd, 1 %define abs_levelq r9 %define abs_leveld r9d %define abs_coefq r14 %define zigzagiq r5 %define zigzagid r5d %if num_coefs == 8 mov dword levelgt1_ctxm, 8 %else mov dword levelgt1_ctxm, 9 %endif %if psy LOAD_DUP m6, psy_trellism %define psy_trellis m6 %elif dc LOAD_DUP m6, [unquant_mfq] paddd m6, m6 %define unquant_mf m6 %endif %if dc == 0 mov unquant_mfm, unquant_mfq %endif ; Keep a single offset register to PICify all global constants. ; They're all relative to "beginning of this asm file's .text section", ; even tables that aren't in this file. 
; (Any address in .text would work, this one was just convenient.) lea r0, [$$] %define GLOBAL +r0-$$ TRELLIS_LOOP 0 ; node_ctx 0..3 TRELLIS_LOOP 1 ; node_ctx 1..7 .writeback: ; int level = bnode->level_idx; ; for( int i = b_ac; i <= last_nnz; i++ ) ; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]); ; level = level_tree[level].next; mov iid, last_nnzm add zigzagq, iiq neg iiq %if num_coefs == 16 && dc == 0 mov r2d, b_acm add iiq, r2 %endif %define dctq r10 mov r0d, [nodes_curq + node_level_idx(0) + rax*4] .writeback_loop: movzx r2, byte [zigzagq + iiq] %if cpuflag(ssse3) movd m0, [level_tree + r0*4] movzx r0, word [level_tree + r0*4] psrld m0, 16 movd m1, [dctq + r2*SIZEOF_DCTCOEF] %if HIGH_BIT_DEPTH psignd m0, m1 movd [dctq + r2*SIZEOF_DCTCOEF], m0 %else psignw m0, m1 movd r4d, m0 mov [dctq + r2*SIZEOF_DCTCOEF], r4w %endif %else mov r5d, [level_tree + r0*4] %if HIGH_BIT_DEPTH mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF] %else movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF] %endif movzx r0d, r5w sar r4d, 31 shr r5d, 16 xor r5d, r4d sub r5d, r4d %if HIGH_BIT_DEPTH mov [dctq + r2*SIZEOF_DCTCOEF], r5d %else mov [dctq + r2*SIZEOF_DCTCOEF], r5w %endif %endif inc iiq jle .writeback_loop mov eax, 1 .return: ADD rsp, pad RET %if num_coefs == 16 && dc == 0 .return_zero: pxor m0, m0 mova [r10+ 0], m0 mova [r10+16], m0 %if HIGH_BIT_DEPTH mova [r10+32], m0 mova [r10+48], m0 %endif jmp .return %endif %endmacro ; TRELLIS %macro TRELLIS_LOOP 1 ; ctx_hi .i_loop%1: ; if( !quant_coefs[i] ) mov r6, quant_coefsm %if HIGH_BIT_DEPTH mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF] %else movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF] %endif ; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; mov r10, cabac_state_sigm %if num_coefs == 64 mov r6d, b_interlacedm add r6d, iid movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL] movzx r10, byte [r10 + r6] %elif num_coefs == 8 movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL] movzx r10, byte [r10 + r13] %else movzx r10, byte [r10 + iiq] %endif test abs_leveld, abs_leveld jnz %%.nonzero_quant_coef %if %1 == 0 ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ) ; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); ; nodes_cur[0].score -= cost_sig0; movzx r10, word [cabac_entropy + r10*2 GLOBAL] imul r10, lambda2q shr r10, CABAC_SIZE_BITS - LAMBDA_BITS sub [nodes_curq + node_score(0)], r10 %endif ZERO_LEVEL_IDX %1, cur jmp .i_continue%1 %%.nonzero_quant_coef: ; int sign_coef = orig_coefs[zigzag[i]]; ; int abs_coef = abs( sign_coef ); ; int q = abs( quant_coefs[i] ); movzx zigzagid, byte [zigzagq+iiq] movd m0, abs_leveld mov r6, orig_coefsm %if HIGH_BIT_DEPTH LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] psrad m1, 16 ; sign_coef %endif punpcklqdq m0, m0 ; quant_coef %if cpuflag(ssse3) pabsd m0, m0 pabsd m2, m1 ; abs_coef %else pxor m8, m8 pcmpgtd m8, m1 ; sign_mask pxor m0, m8 pxor m2, m1, m8 psubd m0, m8 psubd m2, m8 %endif psubd m0, [sq_1] ; abs_level movd abs_leveld, m0 xchg nodes_curq, nodes_prevq ; if( i < num_coefs-1 ) ; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i; ; num_coefs == 8 ? 
coeff_flag_offset_chroma_422_dc[i] : i ; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ); ; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 ); ; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1; ; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1; %if %1 == 0 %if dc && num_coefs != 8 cmp iid, i_coefsm1 %else cmp iid, num_coefs-1 %endif je %%.zero_siglast %endif movzx r11, word [cabac_entropy + r10*2 GLOBAL] xor r10, 1 movzx r12, word [cabac_entropy + r10*2 GLOBAL] mov [cost_siglast+0], r11d mov r10, cabac_state_lastm %if num_coefs == 64 movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL] movzx r10, byte [r10 + r6] %elif num_coefs == 8 movzx r10, byte [r10 + r13] %else movzx r10, byte [r10 + iiq] %endif movzx r11, word [cabac_entropy + r10*2 GLOBAL] add r11, r12 mov [cost_siglast+4], r11d %if %1 == 0 xor r10, 1 movzx r10, word [cabac_entropy + r10*2 GLOBAL] add r10, r12 mov [cost_siglast+8], r10d %endif %%.skip_siglast: ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8); ; int d = abs_coef - unquant_abs_level; ; uint64_t ssd = (int64_t)d*d * coef_weight[i]; %if dc pmuludq m0, unquant_mf %else mov r10, unquant_mfm LOAD_DUP m3, [r10 + zigzagiq*4] pmuludq m0, m3 %endif paddd m0, [pq_128] psrld m0, 8 ; unquant_abs_level %if psy || dc == 0 mova m4, m0 %endif psubd m0, m2 SQUARE 0, 3 %if dc psllq m0, 8 %else LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL] pmuludq m0, m5 %endif %if psy test iid, iid jz %%.dc_rounding ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef)); ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis; ; ssd1[k] -= psy_weight * psy_value; mov r6, fenc_dctm %if HIGH_BIT_DEPTH LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF] %else LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] psrad m3, 16 ; orig_coef %endif %if cpuflag(ssse3) psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef) %else PSIGN d, m4, m8 %endif psubd m3, m1 ; predicted_coef paddd m4, m3 %if cpuflag(ssse3) pabsd m4, m4 %else ABSD m3, m4 SWAP 4, 3 %endif LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL] pmuludq m1, psy_trellis pmuludq m4, m1 psubq m0, m4 %if %1 %%.dc_rounding: %endif %endif %if %1 == 0 mova [ssd], m0 %endif %if dc == 0 && %1 == 0 test iid, iid jnz %%.skip_dc_rounding %%.dc_rounding: ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. 
; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15); ; uint64_t ssd = (int64_t)d*d * coef_weight[i]; psrad m1, 31 ; sign_coef>>31 paddd m4, [pd_8] paddd m4, m1 pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15 psubd m4, m2 ; d SQUARE 4, 3 pmuludq m4, m5 mova [ssd], m4 %%.skip_dc_rounding: %endif mova [ssd+16], m0 %assign stack_offset_bak stack_offset cmp abs_leveld, 1 jl %%.switch_coef0 %if %1 == 0 mov r10, [ssd] ; trellis_coef* args %endif movq r12, m0 ; for( int j = 0; j < 8; j++ ) ; nodes_cur[j].score = TRELLIS_SCORE_MAX; %if cpuflag(ssse3) mova [nodes_curq + node_score(0)], m7 mova [nodes_curq + node_score(2)], m7 %else ; avoid store-forwarding stalls on k8/k10 %if %1 == 0 movq [nodes_curq + node_score(0)], m7 %endif movq [nodes_curq + node_score(1)], m7 movq [nodes_curq + node_score(2)], m7 movq [nodes_curq + node_score(3)], m7 %endif mova [nodes_curq + node_score(4)], m7 mova [nodes_curq + node_score(6)], m7 je %%.switch_coef1 %%.switch_coefn: call trellis_coefn.entry%1 call trellis_coefn.entry%1b jmp .i_continue1 %%.switch_coef1: call trellis_coef1.entry%1 call trellis_coefn.entry%1b jmp .i_continue1 %%.switch_coef0: call trellis_coef0_%1 call trellis_coef1.entry%1b .i_continue%1: dec iid %if num_coefs == 16 && dc == 0 cmp iid, b_acm %endif jge .i_loop%1 call trellis_bnode_%1 %if %1 == 0 %if num_coefs == 16 && dc == 0 jz .return_zero %else jz .return %endif jmp .writeback %%.zero_siglast: xor r6d, r6d mov [cost_siglast+0], r6 mov [cost_siglast+8], r6d jmp %%.skip_siglast %endif %endmacro ; TRELLIS_LOOP ; just a synonym for %if %macro IF0 1+ %endmacro %macro IF1 1+ %1 %endmacro %macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev ; for( int j = 0; j < 8; j++ ) ; nodes_cur[j].level_idx = levels_used; ; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 }; ; levels_used++; add levels_usedd, 3 and levels_usedd, ~3 ; allow aligned stores movd m0, levels_usedd pshufd m0, m0, 0 IF%1 mova m1, m0 paddd m0, [pd_0123] IF%1 paddd m1, [pd_4567] mova m2, [nodes_%2q + node_level_idx(0)] IF%1 mova m3, [nodes_%2q + node_level_idx(4)] mova [nodes_curq + node_level_idx(0)], m0 IF%1 mova [nodes_curq + node_level_idx(4)], m1 mova [level_tree + (levels_usedq+0)*4], m2 IF%1 mova [level_tree + (levels_usedq+4)*4], m3 add levels_usedd, (1+%1)*4 %endmacro INIT_XMM sse2 TRELLIS trellis_cabac_4x4, 16, 0, 0 TRELLIS trellis_cabac_8x8, 64, 0, 0 TRELLIS trellis_cabac_4x4_psy, 16, 0, 1 TRELLIS trellis_cabac_8x8_psy, 64, 0, 1 TRELLIS trellis_cabac_dc, 16, 1, 0 TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0 INIT_XMM ssse3 TRELLIS trellis_cabac_4x4, 16, 0, 0 TRELLIS trellis_cabac_8x8, 64, 0, 0 TRELLIS trellis_cabac_4x4_psy, 16, 0, 1 TRELLIS trellis_cabac_8x8_psy, 64, 0, 1 TRELLIS trellis_cabac_dc, 16, 1, 0 TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0 %define stack rsp+gprsize %define scoreq r14 %define bitsq r13 %define bitsd r13d INIT_XMM %macro clocal 1 ALIGN 16 global mangle(private_prefix %+ _%1) mangle(private_prefix %+ _%1): %1: %assign stack_offset stack_offset_bak+gprsize %endmacro %macro TRELLIS_BNODE 1 ; ctx_hi clocal trellis_bnode_%1 ; int j = ctx_hi?1:0; ; trellis_node_t *bnode = &nodes_cur[j]; ; while( ++j < (ctx_hi?8:4) ) ; if( nodes_cur[j].score < bnode->score ) ; bnode = &nodes_cur[j]; %assign j %1 mov rax, [nodes_curq + node_score(j)] lea rax, [rax*8 + j] %rep 3+3*%1 %assign j j+1 mov r11, [nodes_curq + node_score(j)] lea r11, [r11*8 + j] cmp rax, r11 cmova rax, r11 %endrep mov r10, dctm and eax, 7 ret %endmacro ; TRELLIS_BNODE 
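; Note on the macro above: TRELLIS_BNODE packs each node's index into the three
; low bits of score*8 ("lea rax, [rax*8 + j]"), so a single unsigned compare
; plus cmova tracks both the minimum score and which node produced it, and the
; final "and eax, 7" recovers the winning node index.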
TRELLIS_BNODE 0 TRELLIS_BNODE 1 %macro TRELLIS_COEF0 1 ; ctx_hi clocal trellis_coef0_%1 ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); mov r11d, [cost_siglast+0] imul r11, lambda2q shr r11, CABAC_SIZE_BITS - LAMBDA_BITS add r11, [ssd+16] %if %1 == 0 ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1; mov scoreq, [nodes_prevq + node_score(0)] add scoreq, [ssd] sub scoreq, r11 mov [nodes_curq + node_score(0)], scoreq %endif ; memcpy mov scoreq, [nodes_prevq + node_score(1)] mov [nodes_curq + node_score(1)], scoreq mova m1, [nodes_prevq + node_score(2)] mova [nodes_curq + node_score(2)], m1 %if %1 mova m1, [nodes_prevq + node_score(4)] mova [nodes_curq + node_score(4)], m1 mova m1, [nodes_prevq + node_score(6)] mova [nodes_curq + node_score(6)], m1 %endif mov r6d, [nodes_prevq + node_cabac_state(3)] mov [nodes_curq + node_cabac_state(3)], r6d %if %1 mova m1, [nodes_prevq + node_cabac_state(4)] mova [nodes_curq + node_cabac_state(4)], m1 %endif ZERO_LEVEL_IDX %1, prev ret %endmacro ; TRELLIS_COEF0 TRELLIS_COEF0 0 TRELLIS_COEF0 1 %macro START_COEF 1 ; gt1 ; if( (int64_t)nodes_prev[0].score < 0 ) continue; mov scoreq, [nodes_prevq + node_score(j)] %if j > 0 test scoreq, scoreq js .ctx %+ nextj_if_invalid %endif ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 ); %if j >= 3 movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4 movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL] %else movzx r6d, byte [level_statem + coeff_abs_level1_offs] %endif %if %1 xor r6d, 1 %endif movzx bitsd, word [cabac_entropy + r6*2 GLOBAL] ; n.score += ssd; ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ]; %if j == 0 add scoreq, r10 add bitsd, [cost_siglast+8] %else add scoreq, r12 add bitsd, [cost_siglast+4] %endif %endmacro ; START_COEF %macro END_COEF 1 ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); imul bitsq, lambda2q shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS add scoreq, bitsq ; if( n.score < nodes_cur[node_ctx].score ) ; SET_LEVEL( n, abs_level ); ; nodes_cur[node_ctx] = n; cmp scoreq, [nodes_curq + node_score(node_ctx)] jae .ctx %+ nextj_if_valid mov [nodes_curq + node_score(node_ctx)], scoreq %if j == 2 || (j <= 3 && node_ctx == 4) ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed %elif j >= 3 ; if we have updated before, then copy cabac_state from the parent node mov r6d, [nodes_prevq + node_cabac_state(j)] mov [nodes_curq + node_cabac_state(node_ctx)], r6d %endif %if j >= 3 ; skip the transition if we're not going to reuse the context mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2 %endif %if %1 && node_ctx == 7 mov r6d, levelgt1_ctxm mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b %endif mov r6d, [nodes_prevq + node_level_idx(j)] %if %1 mov r11d, abs_leveld shl r11d, 16 or r6d, r11d %else or r6d, 1<<16 %endif mov [level_tree + levels_usedq*4], r6d mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd inc levels_usedd %endmacro ; END_COEF %macro COEF1 2 %assign j %1 %assign nextj_if_valid %1+1 %assign nextj_if_invalid %2 %if j < 4 %assign coeff_abs_level1_offs j+1 %else %assign coeff_abs_level1_offs 0 %endif %if j < 3 %assign node_ctx j+1 %else %assign node_ctx j %endif .ctx %+ j: 
START_COEF 0 add bitsd, 1 << CABAC_SIZE_BITS END_COEF 0 %endmacro ; COEF1 %macro COEFN 2 %assign j %1 %assign nextj_if_valid %2 %assign nextj_if_invalid %2 %if j < 4 %assign coeff_abs_level1_offs j+1 %assign coeff_abs_levelgt1_offs 5 %else %assign coeff_abs_level1_offs 0 %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc %endif %if j < 4 %assign node_ctx 4 %elif j < 7 %assign node_ctx j+1 %else %assign node_ctx 7 %endif .ctx %+ j: START_COEF 1 ; if( abs_level >= 15 ) ; bits += bs_size_ue_big(...) add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX ; n.cabac_state[levelgt1_ctx] %if j == 7 ; && compiling support for 4:2:2 mov r6d, levelgt1_ctxm %define coeff_abs_levelgt1_offs r6 %endif %if j == 7 movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9 %else movzx r10, byte [level_statem + coeff_abs_levelgt1_offs] %endif ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]]; add r10d, r1d movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL] add bitsd, r6d %if node_ctx == 7 movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL] %endif END_COEF 1 %endmacro ; COEFN clocal trellis_coef1 .entry0b: ; ctx_lo, larger of the two abs_level candidates mov r10, [ssd+8] sub r10, r11 mov r12, [ssd+24] sub r12, r11 .entry0: ; ctx_lo, smaller of the two abs_level candidates COEF1 0, 4 COEF1 1, 4 COEF1 2, 4 COEF1 3, 4 .ctx4: rep ret .entry1b: ; ctx_hi, larger of the two abs_level candidates mov r12, [ssd+24] sub r12, r11 .entry1: ; ctx_hi, smaller of the two abs_level candidates trellis_coef1_hi: COEF1 1, 2 COEF1 2, 3 COEF1 3, 4 COEF1 4, 5 COEF1 5, 6 COEF1 6, 7 COEF1 7, 8 .ctx8: rep ret %macro COEFN_PREFIX 1 ; int prefix = X264_MIN( abs_level - 1, 14 ); mov r1d, abs_leveld cmp abs_leveld, 15 jge .level_suffix%1 xor r5d, r5d .skip_level_suffix%1: shl r1d, 7 %endmacro %macro COEFN_SUFFIX 1 .level_suffix%1: ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS; lea r5d, [abs_levelq-14] bsr r5d, r5d shl r5d, CABAC_SIZE_BITS+1 add r5d, 1< * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_X86_UTIL_H #define X264_X86_UTIL_H #ifdef __SSE__ #include #undef M128_ZERO #define M128_ZERO ((__m128){0,0,0,0}) #define x264_union128_t x264_union128_sse_t typedef union { __m128 i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_sse_t; #if HAVE_VECTOREXT typedef uint32_t v4si __attribute__((vector_size (16))); #endif #endif // __SSE__ #if HAVE_X86_INLINE_ASM && HAVE_MMX #define x264_median_mv x264_median_mv_mmx2 static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c ) { asm( "movd %1, %%mm0 \n" "movd %2, %%mm1 \n" "movq %%mm0, %%mm3 \n" "movd %3, %%mm2 \n" "pmaxsw %%mm1, %%mm0 \n" "pminsw %%mm3, %%mm1 \n" "pminsw %%mm2, %%mm0 \n" "pmaxsw %%mm1, %%mm0 \n" "movd %%mm0, %0 \n" :"=m"(*(x264_union32_t*)dst) :"m"(M32( a )), "m"(M32( b )), "m"(M32( c )) :"mm0", "mm1", "mm2", "mm3" ); } #define x264_predictor_difference x264_predictor_difference_mmx2 static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc ) { int sum; static const uint64_t pw_1 = 0x0001000100010001ULL; asm( "pxor %%mm4, %%mm4 \n" "test $1, %1 \n" "jnz 3f \n" "movd -8(%2,%1,4), %%mm0 \n" "movd -4(%2,%1,4), %%mm3 \n" "psubw %%mm3, %%mm0 \n" "jmp 2f \n" "3: \n" "dec %1 \n" "1: \n" "movq -8(%2,%1,4), %%mm0 \n" "psubw -4(%2,%1,4), %%mm0 \n" "2: \n" "sub $2, %1 \n" "pxor %%mm2, %%mm2 \n" "psubw %%mm0, %%mm2 \n" "pmaxsw %%mm2, %%mm0 \n" "paddusw %%mm0, %%mm4 \n" "jg 1b \n" "pmaddwd %4, %%mm4 \n" "pshufw $14, %%mm4, %%mm0 \n" "paddd %%mm0, %%mm4 \n" "movd %%mm4, %0 \n" :"=r"(sum), "+r"(i_mvc) :"r"(mvc), "m"(MEM_DYN( mvc, const int16_t )), "m"(pw_1) :"mm0", "mm2", "mm3", "mm4", "cc" ); return sum; } #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2 static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop) { static const uint64_t pb_2 = 0x0202020202020202ULL; static const uint64_t pb_32 = 0x2020202020202020ULL; static const uint64_t pb_33 = 0x2121212121212121ULL; int amvd; asm( "movd %1, %%mm0 \n" "movd %2, %%mm1 \n" "paddusb %%mm1, %%mm0 \n" "pminub %5, %%mm0 \n" "pxor %%mm2, %%mm2 \n" "movq %%mm0, %%mm1 \n" "pcmpgtb %3, %%mm0 \n" "pcmpgtb %4, %%mm1 \n" "psubb %%mm0, %%mm2 \n" "psubb %%mm1, %%mm2 \n" "movd %%mm2, %0 \n" :"=r"(amvd) :"m"(M16( mvdleft )),"m"(M16( mvdtop )), "m"(pb_2),"m"(pb_32),"m"(pb_33) :"mm0", "mm1", "mm2" ); return (uint16_t)amvd; } #define x264_predictor_clip x264_predictor_clip_mmx2 static ALWAYS_INLINE int x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { static const uint32_t pd_32 = 0x20; intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; asm( "movq (%2), %%mm5 \n" "movd %6, %%mm3 \n" "psllw $2, %%mm5 \n" // Convert to subpel "pshufw $0xEE, %%mm5, %%mm6 \n" "dec %k3 \n" "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} "punpckldq %%mm3, %%mm3 \n" "punpckldq %%mm5, %%mm5 \n" "movd %7, %%mm4 \n" "lea (%0,%3,4), %3 \n" "1: \n" "movq (%0), %%mm0 \n" "add $8, %0 \n" "movq %%mm3, %%mm1 \n" "pxor %%mm2, %%mm2 \n" "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf "pmaxsw %%mm5, %%mm0 \n" "pminsw %%mm6, %%mm0 \n" "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped "movq %%mm0, (%5,%4,4) \n" "and $24, %k2 \n" "add $2, %4 
\n" "add $8, %k2 \n" "shr $4, %k2 \n" // (4-val)>>1 "sub %2, %4 \n" // +1 for each valid motion vector "cmp %3, %0 \n" "jl 1b \n" "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} /* Do the last iteration */ "2: \n" "movd (%0), %%mm0 \n" "pxor %%mm2, %%mm2 \n" "pcmpeqd %%mm0, %%mm3 \n" "pcmpeqd %%mm0, %%mm2 \n" "por %%mm3, %%mm2 \n" "pmovmskb %%mm2, %k2 \n" "pmaxsw %%mm5, %%mm0 \n" "pminsw %%mm6, %%mm0 \n" "movd %%mm0, (%5,%4,4) \n" "inc %4 \n" "and $1, %k2 \n" "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) "3: \n" :"+r"(mvc), "=m"(MEM_DYN( dst, int16_t )), "+r"(tmp), "+r"(mvc_max), "+r"(i) :"r"(dst), "g"(pmv), "m"(pd_32), "m"(MEM_DYN( mvc, const int16_t )) :"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "cc" ); return i; } /* Same as the above, except we do (mv + 2) >> 2 on the input. */ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 static ALWAYS_INLINE int x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { static const uint64_t pw_2 = 0x0002000200020002ULL; static const uint32_t pd_32 = 0x20; intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; asm( "movq (%2), %%mm5 \n" "movq %6, %%mm7 \n" "movd %7, %%mm3 \n" "pshufw $0xEE, %%mm5, %%mm6 \n" "dec %k3 \n" "jz 2f \n" "punpckldq %%mm3, %%mm3 \n" "punpckldq %%mm5, %%mm5 \n" "movd %8, %%mm4 \n" "lea (%0,%3,4), %3 \n" "1: \n" "movq (%0), %%mm0 \n" "add $8, %0 \n" "paddw %%mm7, %%mm0 \n" "psraw $2, %%mm0 \n" "movq %%mm3, %%mm1 \n" "pxor %%mm2, %%mm2 \n" "pcmpeqd %%mm0, %%mm1 \n" "pcmpeqd %%mm0, %%mm2 \n" "por %%mm1, %%mm2 \n" "pmovmskb %%mm2, %k2 \n" "pmaxsw %%mm5, %%mm0 \n" "pminsw %%mm6, %%mm0 \n" "pand %%mm4, %%mm2 \n" "psrlq %%mm2, %%mm0 \n" "movq %%mm0, (%5,%4,4) \n" "and $24, %k2 \n" "add $2, %4 \n" "add $8, %k2 \n" "shr $4, %k2 \n" "sub %2, %4 \n" "cmp %3, %0 \n" "jl 1b \n" "jg 3f \n" /* Do the last iteration */ "2: \n" "movd (%0), %%mm0 \n" "paddw %%mm7, %%mm0 \n" "psraw $2, %%mm0 \n" "pxor %%mm2, %%mm2 \n" "pcmpeqd %%mm0, %%mm3 \n" "pcmpeqd %%mm0, %%mm2 \n" "por %%mm3, %%mm2 \n" "pmovmskb %%mm2, %k2 \n" "pmaxsw %%mm5, %%mm0 \n" "pminsw %%mm6, %%mm0 \n" "movd %%mm0, (%5,%4,4) \n" "inc %4 \n" "and $1, %k2 \n" "sub %2, %4 \n" "3: \n" :"+r"(mvc), "=m"(MEM_DYN( dst, int16_t )), "+r"(tmp), "+r"(mvc_max), "+r"(i) :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(MEM_DYN( mvc, const int16_t )) :"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "cc" ); return i; } #endif #endif x264-master/common/x86/x86inc.asm000066400000000000000000001660541502133446700166310ustar00rootroot00000000000000;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** ;* Copyright (C) 2005-2025 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner ;* Anton Mitrofanov ;* Fiona Glaser ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. ;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** ; This is a header file for the x264ASM assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used in x264. ; Unlike the rest of x264, this file is available under an ISC license, as it ; has significant usefulness outside of x264 and we want it to be available ; to the largest audience possible. Of course, if you modify it for your own ; purposes to add a new feature, we strongly encourage contributing a patch ; as this feature might be useful for others as well. Send patches or ideas ; to x264-devel@videolan.org . %ifndef private_prefix %define private_prefix x264 %endif %ifndef public_prefix %define public_prefix private_prefix %endif %ifndef STACK_ALIGNMENT %if ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 %endif %endif %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define WIN64 1 %else %define UNIX64 1 %endif %endif %define FORMAT_ELF 0 %define FORMAT_MACHO 0 %ifidn __OUTPUT_FORMAT__,elf %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf32 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf64 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,macho %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho32 %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho64 %define FORMAT_MACHO 1 %endif %ifdef PREFIX %define mangle(x) _ %+ x %else %define mangle(x) x %endif ; Use VEX-encoding even in non-AVX functions %ifndef FORCE_VEX_ENCODING %define FORCE_VEX_ENCODING 0 %endif %macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,win32 SECTION .rdata align=%1 %elif WIN64 SECTION .rdata align=%1 %else SECTION .rodata align=%1 %endif %endmacro %if ARCH_X86_64 %define PIC 1 ; always use PIC on x86-64 default rel %elifidn __OUTPUT_FORMAT__,win32 %define PIC 0 ; PIC isn't used on 32-bit Windows %elifndef PIC %define PIC 0 %endif %define HAVE_PRIVATE_EXTERN 1 %ifdef __NASM_VERSION_ID__ %use smartalign %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 %define HAVE_PRIVATE_EXTERN 0 %endif %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most of x264's asm. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). 
To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. ; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro. ; RET: ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: ; Use this instead of RET if it's a branch target. ; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size %macro DECLARE_REG 2-3 %define r%1q %2 %define r%1d %2d %define r%1w %2w %define r%1b %2b %define r%1h %2h %define %2q %2 %if %0 == 2 %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else %define r%1m [rstk + stack_offset + %3] %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro %macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 %define r%1h %3 %define e%1h %3 %define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE bx, bl, bh DECLARE_REG_SIZE cx, cl, ch DECLARE_REG_SIZE dx, dl, dh DECLARE_REG_SIZE si, sil, null DECLARE_REG_SIZE di, dil, null DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments %macro DECLARE_REG_TMP 1-* %assign %%i 0 %rep %0 CAT_XDEFINE t, %%i, r%1 %assign %%i %%i+1 %rotate 1 %endrep %endmacro %macro DECLARE_REG_TMP_SIZE 0-* %rep %0 %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w %define t%1h t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep %endmacro DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif %macro LEA 2 %if ARCH_X86_64 lea %1, [%2] %elif PIC call $+5 ; special-cased to not affect the RSB on most CPU:s pop %1 add %1, -$+1+%2 %else mov %1, %2 %endif %endmacro ; Repeats an instruction/operation for multiple arguments. 
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" %macro REPX 2-* ; operation, args %xdefine %%f(x) %1 %rep %0 - 1 %rotate 1 %%f(%1) %endrep %endmacro %macro PUSH 1 push %1 %ifidn rstk, rsp %assign stack_offset stack_offset+gprsize %endif %endmacro %macro POP 1 pop %1 %ifidn rstk, rsp %assign stack_offset stack_offset-gprsize %endif %endmacro %macro PUSH_IF_USED 1-* %rep %0 %if %1 < regs_used PUSH r%1 %endif %rotate 1 %endrep %endmacro %macro POP_IF_USED 1-* %rep %0 %if %1 < regs_used pop r%1 %endif %rotate 1 %endrep %endmacro %macro LOAD_IF_USED 1-* %rep %0 %if %1 < num_args mov r%1, r %+ %1 %+ mp %endif %rotate 1 %endrep %endmacro %macro SUB 2 sub %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro %macro movifnidn 2 %ifnidn %1, %2 mov %1, %2 %endif %endmacro %if ARCH_X86_64 == 0 %define movsxd movifnidn %endif %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 %endif %endmacro %macro ASSERT 1 %if (%1) == 0 %error assertion ``%1'' failed %endif %endmacro %macro DEFINE_ARGS 0-* %ifdef n_arg_names %assign %%i 0 %rep n_arg_names CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep %endif %xdefine %%stack_offset stack_offset %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 %xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1 %endrep %xdefine stack_offset %%stack_offset %assign n_arg_names %0 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) ; Large stack allocations on Windows need to use stack probing in order ; to guarantee that all stack memory is committed before accessing it. ; This is done by ensuring that the guard page(s) at the end of the ; currently committed pages are touched prior to any pages beyond that. 
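; Illustrative example (hypothetical size, not taken from the source): on WIN64,
; where the probe granularity below is 8192 bytes, "PROBE_STACK 20480" expands
; to two dummy loads,
;     mov eax, [rsp-8192]
;     mov eax, [rsp-16384]
; so every guard-page window is touched before the SUB rsp that follows in
; ALLOC_STACK moves the stack pointer past it.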
%if WIN64 %assign STACK_PROBE_SIZE 8192 %elifidn __OUTPUT_FORMAT__, win32 %assign STACK_PROBE_SIZE 4096 %else %assign STACK_PROBE_SIZE 0 %endif %macro PROBE_STACK 1 ; stack_size %if STACK_PROBE_SIZE %assign %%i STACK_PROBE_SIZE %rep %1 / STACK_PROBE_SIZE mov eax, [rsp-%%i] %assign %%i %%i+STACK_PROBE_SIZE %endrep %endif %endmacro %macro RESET_STACK_STATE 0 %ifidn rstk, rsp %assign stack_offset stack_offset - stack_size_padded %else %xdefine rstk rsp %endif %assign stack_size 0 %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro %macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs RESET_STACK_STATE %ifnum %2 %if mmsize != 8 %assign xmm_regs_used %2 %endif %endif %ifnum %1 %if %1 != 0 %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif %if WIN64 %assign %%pad %%pad + 32 ; shadow space %if xmm_regs_used > 8 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) PROBE_STACK stack_size_padded SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) %xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) %if %1 < 0 ; need to store rsp on stack %xdefine rstkm [rsp + stack_size + %%pad] %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) PROBE_STACK stack_size_padded mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif %endif %endmacro %macro SETUP_STACK_POINTER 0-1 0 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 ; Reserve an additional register for storing the original stack pointer, but avoid using ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) %if ARCH_X86_64 && regs_used == 7 %assign regs_used 8 %elif ARCH_X86_64 == 0 && regs_used == 1 %assign regs_used 2 %endif %endif %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. %assign regs_used 5 + UNIX64 * 3 %endif %endif %endif %endmacro %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx DECLARE_REG 1, rdx DECLARE_REG 2, R8 DECLARE_REG 3, R9 DECLARE_REG 4, R10, 40 DECLARE_REG 5, R11, 48 DECLARE_REG 6, rax, 56 DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 DECLARE_REG 11, R14, 96 DECLARE_REG 12, R15, 104 DECLARE_REG 13, R12, 112 DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro ; Push XMM registers to the stack. If no argument is specified all used register ; will be pushed, otherwise only push previously unpushed registers. %macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed %if mmsize != 8 %if %0 == 2 %assign %%pushed %2 %assign xmm_regs_used %1 %elif %0 == 1 %assign %%pushed xmm_regs_used %assign xmm_regs_used %1 %else %assign %%pushed 0 %endif ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif %assign %%pushed %%pushed - high_mm_regs - 8 %if %%pushed < 0 %assign %%pushed 0 %endif %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8 %if %%regs_to_push > 0 ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32 %assign %%i %%pushed + 8 %rep %%regs_to_push movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep %endif %endif %endmacro ; Allocated stack space for XMM registers and push all, or a subset, of those %macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved RESET_STACK_STATE %if mmsize != 8 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 + high_mm_regs %if %0 == 2 ASSERT %2 >= %1 %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8 %else %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8 %endif %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
%assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endif %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 %assign %%i xmm_regs_used - high_mm_regs %rep %%xmm_regs_on_stack %assign %%i %%i-1 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 + high_mm_regs movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 + high_mm_regs movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro %macro WIN64_RESTORE_XMM 0 WIN64_RESTORE_XMM_INTERNAL RESET_STACK_STATE %endmacro %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi DECLARE_REG 2, rdx DECLARE_REG 3, rcx DECLARE_REG 4, R8 DECLARE_REG 5, R9 DECLARE_REG 6, rax, 8 DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 DECLARE_REG 11, R14, 48 DECLARE_REG 12, R15, 56 DECLARE_REG 13, R12, 64 DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== DECLARE_REG 0, eax, 4 DECLARE_REG 1, ecx, 8 DECLARE_REG 2, edx, 12 DECLARE_REG 3, ebx, 16 DECLARE_REG 4, esi, 20 DECLARE_REG 5, edi, 24 DECLARE_REG 6, ebp, 28 %define rsp esp %macro DECLARE_ARG 1-* %rep %0 %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep %endmacro DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args %if num_args > 7 %assign num_args 7 %endif %if regs_used > 7 %assign regs_used 7 %endif SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4, %3 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 6, 5, 4, 3 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 %macro WIN64_SPILL_XMM 1-2 RESET_STACK_STATE %if mmsize != 8 %assign xmm_regs_used %1 %endif %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 %endmacro %macro WIN64_RESTORE_XMM 0 RESET_STACK_STATE %endmacro %macro WIN64_PUSH_XMM 0-2 %if mmsize != 8 && %0 >= 1 %assign xmm_regs_used %1 %endif %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue || cpuflag(ssse3) RET %else rep ret %endif annotate_function_size %endmacro %define last_branch_adr $$ %macro AUTO_REP_RET 0 %if notcpuflag(ssse3) times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. %endif ret annotate_function_size %endmacro %macro BRANCH_INSTR 0-* %rep %0 %macro %1 1-2 %1 %2 %1 %if notcpuflag(ssse3) %%branch_instr equ $ %xdefine last_branch_adr %%branch_instr %endif %endmacro %rotate 1 %endrep %endmacro BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp %macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent %if has_epilogue call %1 RET %elif %2 jmp %1 %endif annotate_function_size %endmacro ;============================================================================= ; arch-independent part ;============================================================================= %assign function_align 16 ; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
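; For example, under "INIT_XMM sse2" (the name below matches the SAD code
; earlier in this tree):
;     cglobal pixel_sad_16x16, 4,5,8
; assembles the function as x264_pixel_sad_16x16_sse2 (with a leading underscore
; prepended when PREFIX is defined) and marks the symbol hidden on ELF targets.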
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro %macro cvisible 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 0, %1 %+ SUFFIX, %2 %endmacro %macro cglobal_internal 2-3+ annotate_function_size %ifndef cglobaled_%2 %if %1 %xdefine %2 mangle(private_prefix %+ _ %+ %2) %else %xdefine %2 mangle(public_prefix %+ _ %+ %2) %endif %xdefine %2.skip_prologue %2 %+ .skip_prologue CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 %xdefine current_function_section __SECT__ %if FORMAT_ELF %if %1 global %2:function hidden %else global %2:function %endif %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 global %2:private_extern %else global %2 %endif align function_align %2: RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif %endmacro ; Create a global symbol from a local label with the correct name mangling and type %macro cglobal_label 1 %if FORMAT_ELF global current_function %+ %1:function hidden %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN global current_function %+ %1:private_extern %else global current_function %+ %1 %endif %1: %endmacro %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 2 extern %1 %endmacro ; Like cextern, but without the prefix. This should be used for symbols from external libraries. %macro cextern_naked 1 %ifdef PREFIX %xdefine %1 mangle(%1) %endif CAT_XDEFINE cglobaled_, %1, 3 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) %if FORMAT_ELF global %1:data hidden %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN global %1:private_extern %else global %1 %endif %1: %2 %endmacro %if FORMAT_ELF ; The GNU linker assumes the stack is executable by default. [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %ifdef __NASM_VERSION_ID__ %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03 %if ARCH_X86_64 ; Control-flow Enforcement Technology (CET) properties. [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize] dd 0x00000004 ; n_namesz dd gprsize + 8 ; n_descsz dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0 db "GNU",0 ; n_name dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND dd 0x00000004 ; pr_datasz dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK dd 0x00000000 ; pr_padding %endif %endif %endif %endif ; Tell debuggers how large the function was. ; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. ; This is invoked by RET and similar macros, and also cglobal does it for the previous function, ; but if the last function in a source file doesn't use any of the standard macros for its epilogue, ; then its size might be unspecified. 
%macro annotate_function_size 0 %ifdef __YASM_VER__ %ifdef current_function %if FORMAT_ELF current_function_section %%ecf equ $ size current_function %%ecf - current_function __SECT__ %endif %endif %endif %endmacro ; cpuflags %assign cpuflags_mmx (1<<0) %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx %assign cpuflags_3dnow (1<<2) | cpuflags_mmx %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 %assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 %assign cpuflags_sse3 (1<<8) | cpuflags_sse2 %assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 %assign cpuflags_sse4 (1<<10) | cpuflags_ssse3 %assign cpuflags_sse42 (1<<11) | cpuflags_sse4 %assign cpuflags_aesni (1<<12) | cpuflags_sse42 %assign cpuflags_clmul (1<<13) | cpuflags_sse42 %assign cpuflags_gfni (1<<14) | cpuflags_aesni|cpuflags_clmul %assign cpuflags_avx (1<<15) | cpuflags_sse42 %assign cpuflags_xop (1<<16) | cpuflags_avx %assign cpuflags_fma4 (1<<17) | cpuflags_avx %assign cpuflags_fma3 (1<<18) | cpuflags_avx %assign cpuflags_bmi1 (1<<19) | cpuflags_avx|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<20) | cpuflags_bmi1 %assign cpuflags_avx2 (1<<21) | cpuflags_fma3|cpuflags_bmi2 %assign cpuflags_avx512 (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL %assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ %assign cpuflags_cache32 (1<<24) %assign cpuflags_cache64 (1<<25) %assign cpuflags_aligned (1<<26) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<27) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) %define notcpuflag(x) (cpuflag(x) ^ 1) ; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. %macro INIT_CPUFLAGS 0-* %xdefine SUFFIX %undef cpuname %assign cpuflags 0 %if %0 >= 1 %rep %0 %ifdef cpuname %xdefine cpuname cpuname %+ _%1 %else %xdefine cpuname %1 %endif %assign cpuflags cpuflags | cpuflags_%1 %rotate 1 %endrep %xdefine SUFFIX _ %+ cpuname %if cpuflag(avx) %assign avx_enabled 1 %endif %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) %define mova movaps %define movu movups %define movnta movntps %endif %if cpuflag(aligned) %define movu mova %elif cpuflag(sse3) && notcpuflag(ssse3) %define movu lddqu %endif %endif %if ARCH_X86_64 || cpuflag(sse2) %ifdef __NASM_VERSION_ID__ ALIGNMODE p6 %else CPU amdnop %endif %else %ifdef __NASM_VERSION_ID__ ALIGNMODE nop %else CPU basicnop %endif %endif %endmacro ; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# ; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# ; (All 4 remain in sync through SWAP.) 
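; A hedged sketch (disabled with %if 0, names invented) of how the cpuflags /
; INIT_CPUFLAGS machinery above is typically used: the same body is assembled
; once per instruction set, SUFFIX turns the name into example_abs_w16_sse2 and
; example_abs_w16_ssse3, and cpuflag() is resolved at assembly time.
%if 0
%macro EXAMPLE_ABS_W16 0
cglobal example_abs_w16, 1,1,2, buf
    movu    m0, [bufq]
%if cpuflag(ssse3)
    pabsw   m0, m0               ; single instruction when SSSE3 is available
%else
    pxor    m1, m1
    psubw   m1, m0
    pmaxsw  m0, m1               ; max(a, -a) fallback for plain SSE2
%endif
    movu    [bufq], m0
    RET
%endmacro

INIT_XMM sse2
EXAMPLE_ABS_W16
INIT_XMM ssse3
EXAMPLE_ABS_W16
%endif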
%macro CAT_XDEFINE 3 %xdefine %1%2 %3 %endmacro %macro CAT_UNDEF 2 %undef %1%2 %endmacro %macro DEFINE_MMREGS 1 ; mmtype %assign %%prev_mmregs 0 %ifdef num_mmregs %assign %%prev_mmregs num_mmregs %endif %assign num_mmregs 8 %if ARCH_X86_64 && mmsize >= 16 %assign num_mmregs 16 %if cpuflag(avx512) || mmsize == 64 %assign num_mmregs 32 %endif %endif %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1 %+ %%i CAT_XDEFINE nn%1, %%i, %%i %assign %%i %%i+1 %endrep %if %%prev_mmregs > num_mmregs %rep %%prev_mmregs - num_mmregs CAT_UNDEF m, %%i CAT_UNDEF nn %+ mmtype, %%i %assign %%i %%i+1 %endrep %endif %xdefine mmtype %1 %endmacro ; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper %macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg %if ARCH_X86_64 && cpuflag(avx512) %assign %%i %1 %rep 16-%1 %assign %%i_high %%i+16 SWAP %%i, %%i_high %assign %%i %%i+1 %endrep %endif %endmacro %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 %define mova movq %define movu movq %define movh movd %define movnta movntq INIT_CPUFLAGS %1 DEFINE_MMREGS mm %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled FORCE_VEX_ENCODING %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS xmm %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif %xdefine bcstw 1to8 %xdefine bcstd 1to4 %xdefine bcstq 1to2 %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION %xdefine bcstw 1to16 %xdefine bcstd 1to8 %xdefine bcstq 1to4 %endmacro %macro INIT_ZMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_ZMM %1 %define mmsize 64 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION %xdefine bcstw 1to32 %xdefine bcstd 1to16 %xdefine bcstq 1to8 %endmacro INIT_XMM %macro DECLARE_MMCAST 1 %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 %define ymmzmm%1 ymm%1 %define zmmmm%1 mm%1 %define zmmxmm%1 xmm%1 %define zmmymm%1 ymm%1 %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %define zm%1 zmm %+ m%1 %endmacro %assign i 0 %rep 32 DECLARE_MMCAST i %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles. 
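; A hedged sketch (kept out of the build with %if 0, names invented) of the
; register-name permutation described above. SWAP only renames m0/m1 for the
; assembler, so the caller sees the "swapped" results without any extra moves
; being emitted; the xm#/ym# casts from DECLARE_MMCAST follow the renaming too.
%if 0
INIT_XMM sse2
%macro EXAMPLE_SUMSUB 0
    paddw  m0, m1        ; m0 = a+b
    paddw  m1, m1
    psubw  m1, m0        ; m1 = b-a
    SWAP   0, 1          ; rename: callers now see the difference as m0, the sum as m1
%endmacro

cglobal example_sumsub_w16, 1,1,2, buf
    movu   m0, [bufq]            ; a
    movu   m1, [bufq+16]         ; b
    EXAMPLE_SUMSUB
    movu   [bufq], m0            ; stores b-a (the renamed register)
    movu   [bufq+16], m1         ; stores a+b
    RET
%endif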
%macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 %xdefine m%1 %%tmp%2 CAT_XDEFINE nn, m%1, %1 %rotate 2 %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) %ifnum %1 ; SWAP 0, 1, ... SWAP_INTERNAL_NUM %1, %2 %else ; SWAP m0, m1, ... SWAP_INTERNAL_NAME %1, %2 %endif %endmacro %macro SWAP_INTERNAL_NUM 2-* %rep %0-1 %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp CAT_XDEFINE nn, m%1, %1 CAT_XDEFINE nn, m%2, %2 %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* %xdefine %%args nn %+ %1 %rep %0-1 %xdefine %%args %%args, nn %+ %2 %rotate 1 %endrep SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later ; calls to that function will automatically load the permutation, so values can ; be returned in mmregs. %macro SAVE_MM_PERMUTATION 0-1 %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %assign %%i 0 %rep num_mmregs %xdefine %%tmp m %+ %%i CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp %assign %%i %%i+1 %endrep %endmacro %macro LOAD_MM_PERMUTATION 0-1 ; name to load from %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %xdefine %%tmp %%f %+ 0 %ifnum %%tmp DEFINE_MMREGS mmtype %assign %%i 0 %rep num_mmregs %xdefine %%tmp %%f %+ %%i CAT_XDEFINE %%m, %%i, m %+ %%tmp %assign %%i %%i+1 %endrep %rep num_mmregs %assign %%i %%i-1 CAT_XDEFINE m, %%i, %%m %+ %%i CAT_XDEFINE nn, m %+ %%i, %%i %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 %ifid %1 call_internal %1 %+ SUFFIX, %1 %else call %1 %endif %endmacro %macro call_internal 2 %xdefine %%i %2 %define %%j %%i %ifndef cglobaled_%2 %ifdef cglobaled_%1 %xdefine %%i %1 %endif %elif FORMAT_ELF %if ARCH_X86_64 %if cglobaled_%2 >= 2 ; Always emit PLT relocations when calling external functions, ; the linker will eliminate unnecessary PLT indirections anyway. %define %%j %%i wrt ..plt %endif %elif PIC && cglobaled_%2 == 3 ; Go through the GOT for functions declared using cextern_naked with ; PIC, as such functions presumably exists in external libraries. extern _GLOBAL_OFFSET_TABLE_ LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc %define %%j [eax+%%i wrt ..got] %endif %endif call %%j LOAD_MM_PERMUTATION %%i %endmacro ; Substitutions that reduce instruction size but are functionally equivalent %macro add 2 %ifnum %2 %if %2==128 sub %1, -128 %else add %1, %2 %endif %else add %1, %2 %endif %endmacro %macro sub 2 %ifnum %2 %if %2==128 add %1, -128 %else sub %1, %2 %endif %else sub %1, %2 %endif %endmacro ;============================================================================= ; AVX abstraction layer ;============================================================================= %assign i 0 %rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 CAT_XDEFINE sizeofzmm, i, 64 CAT_XDEFINE regnumofxmm, i, i CAT_XDEFINE regnumofymm, i, i CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i %macro CHECK_AVX_INSTR_EMU 3-* %xdefine %%opcode %1 %xdefine %%dst %2 %rep %0-2 %ifidn %%dst, %3 %error non-avx emulation of ``%%opcode'' is not supported %endif %rotate 1 %endrep %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ %ifnum sizeof%7 %assign __sizeofreg sizeof%7 %elifnum sizeof%6 %assign __sizeofreg sizeof%6 %else %assign __sizeofreg mmsize %endif %assign __emulate_avx 0 %if avx_enabled && __sizeofreg >= 16 %xdefine __instr v%1 %else %xdefine __instr %1 %if %0 >= 8+%4 %assign __emulate_avx 1 %endif %endif %ifnidn %2, fnord %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) %error use of ``%1'' sse2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) %error use of ``%1'' avx2 instruction in cpuname function: current_function %elif __sizeofreg == 16 && notcpuflag(sse) %error use of ``%1'' sse instruction in cpuname function: current_function %elif __sizeofreg == 32 && notcpuflag(avx) %error use of ``%1'' avx instruction in cpuname function: current_function %elif __sizeofreg == 64 && notcpuflag(avx512) %error use of ``%1'' avx512 instruction in cpuname function: current_function %elifidn %1, pextrw ; special case because the base instruction is mmx2, %ifnid %6 ; but sse4 is required for memory operands %if notcpuflag(sse4) %error use of ``%1'' sse4 instruction in cpuname function: current_function %endif %endif %endif %endif %endif %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 %if %5 && %4 == 0 %ifnidn %6, %7 %ifidn %6, %8 %xdefine __src1 %8 %xdefine __src2 %7 %elifnnum sizeof%8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. %xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %endif %ifnidn %6, __src1 %if %0 >= 9 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 %else CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 MOVAPS %6, __src1 %else MOVDQA %6, __src1 %endif %endif %if %0 >= 9 %1 %6, __src2, %9 %else %1 %6, __src2 %endif %elif %0 >= 9 %if avx_enabled && __sizeofreg >= 16 && %4 == 1 %ifnnum regnumof%7 %if %3 vmovaps %6, %7 %else vmovdqa %6, %7 %endif __instr %6, %6, %8, %9 %else __instr %6, %7, %8, %9 %endif %else __instr %6, %7, %8, %9 %endif %elif %0 == 8 %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 %xdefine __src2 %8 %if %5 %ifnum regnumof%7 %ifnum regnumof%8 %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 ; Most VEX-encoded instructions require an additional byte to encode when ; src2 is a high register (e.g. m8..15). If the instruction is commutative ; we can swap src1 and src2 when doing so reduces the instruction length. %xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %elifnum regnumof%8 ; put memory operands in src2 when possible %xdefine __src1 %8 %xdefine __src2 %7 %else %assign __emulate_avx 1 %endif %elifnnum regnumof%7 ; EVEX allows imm8 shift instructions to be used with memory operands, ; but VEX does not. This handles those special cases. 
%ifnnum %8 %assign __emulate_avx 1 %elif notcpuflag(avx512) %assign __emulate_avx 1 %endif %endif %if __emulate_avx ; a separate load is required %if %3 vmovaps %6, %7 %else vmovdqa %6, %7 %endif __instr %6, %6, %8 %else __instr %6, __src1, __src2 %endif %else __instr %6, %7, %8 %endif %elif %0 == 7 %if avx_enabled && __sizeofreg >= 16 && %5 %xdefine __src1 %6 %xdefine __src2 %7 %ifnum regnumof%6 %ifnum regnumof%7 %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 %xdefine __src1 %7 %xdefine __src2 %6 %endif %endif %endif __instr %6, __src1, __src2 %else __instr %6, %7 %endif %else __instr %6 %endif %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not %macro AVX_INSTR 1-5 fnord, 0, 255, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 %elifidn %3, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 %elifidn %4, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 %elifidn %5, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 %else RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 %endif %endmacro %endmacro ; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 AVX_INSTR addsd, sse2, 1, 0, 0 AVX_INSTR addss, sse, 1, 0, 0 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 AVX_INSTR aesdec, aesni, 0, 0, 0 AVX_INSTR aesdeclast, aesni, 0, 0, 0 AVX_INSTR aesenc, aesni, 0, 0, 0 AVX_INSTR aesenclast, aesni, 0, 0, 0 AVX_INSTR aesimc, aesni AVX_INSTR aeskeygenassist, aesni AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 1, 0 AVX_INSTR blendps, sse4, 1, 1, 0 AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR cmpeqpd, sse2, 1, 0, 1 AVX_INSTR cmpeqps, sse, 1, 0, 1 AVX_INSTR cmpeqsd, sse2, 1, 0, 0 AVX_INSTR cmpeqss, sse, 1, 0, 0 AVX_INSTR cmplepd, sse2, 1, 0, 0 AVX_INSTR cmpleps, sse, 1, 0, 0 AVX_INSTR cmplesd, sse2, 1, 0, 0 AVX_INSTR cmpless, sse, 1, 0, 0 AVX_INSTR cmpltpd, sse2, 1, 0, 0 AVX_INSTR cmpltps, sse, 1, 0, 0 AVX_INSTR cmpltsd, sse2, 1, 0, 0 AVX_INSTR cmpltss, sse, 1, 0, 0 AVX_INSTR cmpneqpd, sse2, 1, 0, 1 AVX_INSTR cmpneqps, sse, 1, 0, 1 AVX_INSTR cmpneqsd, sse2, 1, 0, 0 AVX_INSTR cmpneqss, sse, 1, 0, 0 AVX_INSTR cmpnlepd, sse2, 1, 0, 0 AVX_INSTR cmpnleps, sse, 1, 0, 0 AVX_INSTR cmpnlesd, sse2, 1, 0, 0 AVX_INSTR cmpnless, sse, 1, 0, 0 AVX_INSTR cmpnltpd, sse2, 1, 0, 0 AVX_INSTR cmpnltps, sse, 1, 0, 0 AVX_INSTR cmpnltsd, sse2, 1, 0, 0 AVX_INSTR cmpnltss, sse, 1, 0, 0 AVX_INSTR cmpordpd, sse2 1, 0, 1 AVX_INSTR cmpordps, sse 1, 0, 1 AVX_INSTR cmpordsd, sse2 1, 0, 0 AVX_INSTR cmpordss, sse 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 AVX_INSTR cmpunordpd, sse2, 1, 0, 1 AVX_INSTR cmpunordps, sse, 1, 0, 1 AVX_INSTR cmpunordsd, sse2, 1, 0, 0 AVX_INSTR cmpunordss, sse, 1, 0, 0 AVX_INSTR comisd, sse2, 1 AVX_INSTR comiss, sse, 1 AVX_INSTR cvtdq2pd, sse2, 1 AVX_INSTR cvtdq2ps, sse2, 1 AVX_INSTR cvtpd2dq, sse2, 1 AVX_INSTR cvtpd2ps, sse2, 1 AVX_INSTR cvtps2dq, 
sse2, 1 AVX_INSTR cvtps2pd, sse2, 1 AVX_INSTR cvtsd2si, sse2, 1 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 AVX_INSTR cvtsi2ss, sse, 1, 0, 0 AVX_INSTR cvtss2sd, sse2, 1, 0, 0 AVX_INSTR cvtss2si, sse, 1 AVX_INSTR cvttpd2dq, sse2, 1 AVX_INSTR cvttps2dq, sse2, 1 AVX_INSTR cvttsd2si, sse2, 1 AVX_INSTR cvttss2si, sse, 1 AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 AVX_INSTR extractps, sse4, 1 AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 AVX_INSTR ldmxcsr, sse, 1 AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 AVX_INSTR maxsd, sse2, 1, 0, 0 AVX_INSTR maxss, sse, 1, 0, 0 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 AVX_INSTR minsd, sse2, 1, 0, 0 AVX_INSTR minss, sse, 1, 0, 0 AVX_INSTR movapd, sse2, 1 AVX_INSTR movaps, sse, 1 AVX_INSTR movd, mmx AVX_INSTR movddup, sse3, 1 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 AVX_INSTR movhpd, sse2, 1, 0, 0 AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 AVX_INSTR movmskpd, sse2, 1 AVX_INSTR movmskps, sse, 1 AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2, 1 AVX_INSTR movntps, sse, 1 AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3, 1 AVX_INSTR movsldup, sse3, 1 AVX_INSTR movss, sse, 1, 0, 0 AVX_INSTR movupd, sse2, 1 AVX_INSTR movups, sse, 1 AVX_INSTR mpsadbw, sse4, 0, 1, 0 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 AVX_INSTR mulsd, sse2, 1, 0, 0 AVX_INSTR mulss, sse, 1, 0, 0 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 AVX_INSTR packssdw, mmx, 0, 0, 0 AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0 AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0 AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0 AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0 AVX_INSTR pclmulqdq, clmul, 0, 1, 0 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpistri, sse42 AVX_INSTR pcmpistrm, sse42 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 AVX_INSTR 
phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4, 0, 1, 0 AVX_INSTR pinsrd, sse4, 0, 1, 0 AVX_INSTR pinsrq, sse4, 0, 1, 0 AVX_INSTR pinsrw, mmx2, 0, 1, 0 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaxsb, sse4, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pmovmskb, mmx2 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 AVX_INSTR pshufb, ssse3, 0, 0, 0 AVX_INSTR pshufd, sse2 AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, mmx, 0, 0, 0 AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1 AVX_INSTR rcpss, sse, 1, 0, 0 AVX_INSTR roundpd, sse4, 1 AVX_INSTR roundps, sse4, 1 AVX_INSTR roundsd, sse4, 1, 1, 0 AVX_INSTR roundss, sse4, 1, 1, 0 AVX_INSTR rsqrtps, sse, 1 AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 AVX_INSTR sqrtpd, sse2, 1 AVX_INSTR sqrtps, sse, 1 AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 AVX_INSTR stmxcsr, sse, 1 AVX_INSTR subpd, sse2, 1, 0, 0 AVX_INSTR subps, sse, 1, 0, 0 AVX_INSTR subsd, sse2, 1, 0, 0 AVX_INSTR subss, sse, 1, 0, 0 AVX_INSTR ucomisd, sse2, 1 AVX_INSTR ucomiss, sse, 1 AVX_INSTR unpckhpd, sse2, 1, 0, 0 AVX_INSTR unpckhps, sse, 1, 0, 0 AVX_INSTR unpcklpd, sse2, 1, 0, 0 AVX_INSTR unpcklps, sse, 1, 0, 0 AVX_INSTR xorpd, sse2, 1, 0, 1 AVX_INSTR xorps, sse, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 AVX_INSTR pfmul, 3dnow, 1, 0, 1 AVX_INSTR pfsub, 3dnow, 1, 0, 0 ;%1 == instruction ;%2 == minimal instruction set %macro GPR_INSTR 2 %macro %1 2-5 fnord, %1, %2 %ifdef cpuname %if notcpuflag(%5) %error use of ``%4'' %5 instruction in cpuname function: current_function %endif %endif %ifidn 
%3, fnord %4 %1, %2 %else %4 %1, %2, %3 %endif %endmacro %endmacro GPR_INSTR andn, bmi1 GPR_INSTR bextr, bmi1 GPR_INSTR blsi, bmi1 GPR_INSTR blsmsk, bmi1 GPR_INSTR blsr, bmi1 GPR_INSTR bzhi, bmi2 GPR_INSTR crc32, sse42 GPR_INSTR mulx, bmi2 GPR_INSTR pdep, bmi2 GPR_INSTR pext, bmi2 GPR_INSTR popcnt, sse42 GPR_INSTR rorx, bmi2 GPR_INSTR sarx, bmi2 GPR_INSTR shlx, bmi2 GPR_INSTR shrx, bmi2 ; base-4 constants for shuffles %assign i 0 %rep 256 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) %if j < 10 CAT_XDEFINE q000, j, i %elif j < 100 CAT_XDEFINE q00, j, i %elif j < 1000 CAT_XDEFINE q0, j, i %else CAT_XDEFINE q, j, i %endif %assign i i+1 %endrep %undef i %undef j %macro FMA_INSTR 3 %macro %1 4-7 %1, %2, %3 %if cpuflag(xop) v%5 %1, %2, %3, %4 %elifnidn %1, %4 %6 %1, %2, %3 %7 %1, %4 %else %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. ; FMA3 is only possible if dst is the same as one of the src registers. ; Either src2 or src3 can be a memory operand. %macro FMA4_INSTR 2-* %push fma4_instr %xdefine %$prefix %1 %rep %0 - 1 %macro %$prefix%2 4-6 %$prefix, %2 %if notcpuflag(fma3) && notcpuflag(fma4) %error use of ``%5%6'' fma instruction in cpuname function: current_function %elif cpuflag(fma4) v%5%6 %1, %2, %3, %4 %elifidn %1, %2 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. %ifnum sizeof%3 v%{5}213%6 %2, %3, %4 %else v%{5}132%6 %2, %4, %3 %endif %elifidn %1, %3 v%{5}213%6 %3, %2, %4 %elifidn %1, %4 v%{5}231%6 %4, %2, %3 %else %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported %endif %endmacro %rotate 1 %endrep %pop %endmacro FMA4_INSTR fmadd, pd, ps, sd, ss FMA4_INSTR fmaddsub, pd, ps FMA4_INSTR fmsub, pd, ps, sd, ss FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss ; Macros for converting VEX instructions to equivalent EVEX ones. 
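; A hedged sketch (disabled with %if 0, register choices invented) of the
; helpers defined above, before the EVEX conversion macros that follow: the
; base-4 q#### constants build shuffle immediates that read high-to-low
; (q3210 is the identity), and the FMA_INSTR / FMA4_INSTR wrappers provide one
; 4-operand multiply-add syntax across XOP, FMA3/FMA4 and plain SSE fallbacks.
%if 0
INIT_XMM sse2
cglobal example_shuffle_mac, 1,1,8, buf
    movu     m1, [bufq]
    pshufd   m0, m1, q0123     ; reverse the four dwords of m1
    movu     m5, [bufq+16]
    movu     m6, [bufq+32]
    movu     m7, [bufq+48]
    pmacsww  m4, m5, m6, m7    ; m4 = m5*m6 + m7: vpmacsww on XOP, pmullw+paddw here
    paddw    m0, m4
    movu     [bufq], m0
    RET

INIT_XMM fma3
cglobal example_fmadd, 1,1,3, buf
    movups   m0, [bufq]
    movups   m1, [bufq+16]
    movups   m2, [bufq+32]
    fmaddps  m0, m1, m2, m0    ; m0 = m1*m2 + m0 (vfmadd231ps in this form)
    movups   [bufq], m0
    RET
%endif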
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex %macro %1 2-7 fnord, fnord, %1, %2, %3 %ifidn %3, fnord %define %%args %1, %2 %elifidn %4, fnord %define %%args %1, %2, %3 %else %define %%args %1, %2, %3, %4 %endif %assign %%evex_required cpuflag(avx512) & %7 %ifnum regnumof%1 %if regnumof%1 >= 16 || sizeof%1 > 32 %assign %%evex_required 1 %endif %endif %ifnum regnumof%2 %if regnumof%2 >= 16 || sizeof%2 > 32 %assign %%evex_required 1 %endif %endif %ifnum regnumof%3 %if regnumof%3 >= 16 || sizeof%3 > 32 %assign %%evex_required 1 %endif %endif %if %%evex_required %6 %%args %else %5 %%args ; Prefer VEX over EVEX due to shorter instruction length %endif %endmacro %endmacro EVEX_INSTR vbroadcastf128, vbroadcastf32x4 EVEX_INSTR vbroadcasti128, vbroadcasti32x4 EVEX_INSTR vextractf128, vextractf32x4 EVEX_INSTR vextracti128, vextracti32x4 EVEX_INSTR vinsertf128, vinsertf32x4 EVEX_INSTR vinserti128, vinserti32x4 EVEX_INSTR vmovdqa, vmovdqa32 EVEX_INSTR vmovdqu, vmovdqu32 EVEX_INSTR vpand, vpandd EVEX_INSTR vpandn, vpandnd EVEX_INSTR vpor, vpord EVEX_INSTR vpxor, vpxord EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision EVEX_INSTR vrcpss, vrcp14ss, 1 EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 x264-master/common/x86/x86util.asm000066400000000000000000000517501502133446700170310ustar00rootroot00000000000000;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** ;* Copyright (C) 2008-2025 x264 project ;* ;* Authors: Holger Lubitz ;* Loren Merritt ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
;***************************************************************************** ; like cextern, but with a plain x264 prefix instead of a bitdepth-specific one %macro cextern_common 1 %xdefine %1 mangle(x264 %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %ifndef BIT_DEPTH %assign BIT_DEPTH 0 %endif %if BIT_DEPTH > 8 %assign HIGH_BIT_DEPTH 1 %else %assign HIGH_BIT_DEPTH 0 %endif %assign FENC_STRIDE 16 %assign FDEC_STRIDE 32 %assign SIZEOF_PIXEL 1 %assign SIZEOF_DCTCOEF 2 %define pixel byte %define vpbroadcastdct vpbroadcastw %define vpbroadcastpix vpbroadcastb %if HIGH_BIT_DEPTH %assign SIZEOF_PIXEL 2 %assign SIZEOF_DCTCOEF 4 %define pixel word %define vpbroadcastdct vpbroadcastd %define vpbroadcastpix vpbroadcastw %endif %assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE %assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE %assign PIXEL_MAX ((1 << BIT_DEPTH)-1) %macro FIX_STRIDES 1-* %if HIGH_BIT_DEPTH %rep %0 add %1, %1 %rotate 1 %endrep %endif %endmacro %macro SBUTTERFLY 4 %ifidn %1, dqqq vperm2i128 m%4, m%2, m%3, q0301 ; punpckh vinserti128 m%2, m%2, xm%3, 1 ; punpckl %elif avx_enabled && mmsize >= 16 punpckh%1 m%4, m%2, m%3 punpckl%1 m%2, m%3 %else mova m%4, m%2 punpckl%1 m%2, m%3 punpckh%1 m%4, m%3 %endif SWAP %3, %4 %endmacro %macro SBUTTERFLY2 4 punpckl%1 m%4, m%2, m%3 punpckh%1 m%2, m%2, m%3 SWAP %2, %4, %3 %endmacro %macro TRANSPOSE4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 SBUTTERFLY dq, %1, %3, %5 SBUTTERFLY dq, %2, %4, %5 SWAP %2, %3 %endmacro %macro TRANSPOSE2x4x4W 5 SBUTTERFLY wd, %1, %2, %5 SBUTTERFLY wd, %3, %4, %5 SBUTTERFLY dq, %1, %3, %5 SBUTTERFLY dq, %2, %4, %5 SBUTTERFLY qdq, %1, %2, %5 SBUTTERFLY qdq, %3, %4, %5 %endmacro %macro TRANSPOSE4x4D 5 SBUTTERFLY dq, %1, %2, %5 SBUTTERFLY dq, %3, %4, %5 SBUTTERFLY qdq, %1, %3, %5 SBUTTERFLY qdq, %2, %4, %5 SWAP %2, %3 %endmacro %macro TRANSPOSE8x8W 9-11 %if ARCH_X86_64 SBUTTERFLY wd, %1, %2, %9 SBUTTERFLY wd, %3, %4, %9 SBUTTERFLY wd, %5, %6, %9 SBUTTERFLY wd, %7, %8, %9 SBUTTERFLY dq, %1, %3, %9 SBUTTERFLY dq, %2, %4, %9 SBUTTERFLY dq, %5, %7, %9 SBUTTERFLY dq, %6, %8, %9 SBUTTERFLY qdq, %1, %5, %9 SBUTTERFLY qdq, %2, %6, %9 SBUTTERFLY qdq, %3, %7, %9 SBUTTERFLY qdq, %4, %8, %9 SWAP %2, %5 SWAP %4, %7 %else ; in: m0..m7, unless %11 in which case m6 is in %9 ; out: m0..m7, unless %11 in which case m4 is in %10 ; spills into %9 and %10 %if %0<11 movdqa %9, m%7 %endif SBUTTERFLY wd, %1, %2, %7 movdqa %10, m%2 movdqa m%7, %9 SBUTTERFLY wd, %3, %4, %2 SBUTTERFLY wd, %5, %6, %2 SBUTTERFLY wd, %7, %8, %2 SBUTTERFLY dq, %1, %3, %2 movdqa %9, m%3 movdqa m%2, %10 SBUTTERFLY dq, %2, %4, %3 SBUTTERFLY dq, %5, %7, %3 SBUTTERFLY dq, %6, %8, %3 SBUTTERFLY qdq, %1, %5, %3 SBUTTERFLY qdq, %2, %6, %3 movdqa %10, m%2 movdqa m%3, %9 SBUTTERFLY qdq, %3, %7, %2 SBUTTERFLY qdq, %4, %8, %2 SWAP %2, %5 SWAP %4, %7 %if %0<11 movdqa m%5, %10 %endif %endif %endmacro %macro WIDEN_SXWD 2 punpckhwd m%2, m%1 psrad m%2, 16 %if cpuflag(sse4) pmovsxwd m%1, m%1 %else punpcklwd m%1, m%1 psrad m%1, 16 %endif %endmacro %macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src) %if cpuflag(ssse3) pabsw %1, %2 %elifidn %3, sign ; version for pairing with PSIGNW: modifies src pxor %1, %1 pcmpgtw %1, %2 pxor %2, %1 psubw %2, %1 SWAP %1, %2 %elifidn %1, %2 pxor %3, %3 psubw %3, %1 pmaxsw %1, %3 %elifid %2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %elif %0 == 2 pxor %1, %1 psubw %1, %2 pmaxsw %1, %2 %else mova %1, %2 pxor %3, %3 psubw %3, %1 pmaxsw %1, %3 %endif %endmacro %macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp %if cpuflag(ssse3) pabsw %1, %3 
pabsw %2, %4 %elifidn %1, %3 pxor %5, %5 pxor %6, %6 psubw %5, %1 psubw %6, %2 pmaxsw %1, %5 pmaxsw %2, %6 %else pxor %1, %1 pxor %2, %2 psubw %1, %3 psubw %2, %4 pmaxsw %1, %3 pmaxsw %2, %4 %endif %endmacro %macro ABSB 2 %if cpuflag(ssse3) pabsb %1, %1 %else pxor %2, %2 psubb %2, %1 pminub %1, %2 %endif %endmacro %macro ABSD 2-3 %if cpuflag(ssse3) pabsd %1, %2 %else %define %%s %2 %if %0 == 3 mova %3, %2 %define %%s %3 %endif pxor %1, %1 pcmpgtd %1, %%s pxor %%s, %1 psubd %%s, %1 SWAP %1, %%s %endif %endmacro %macro PSIGN 3-4 %if cpuflag(ssse3) && %0 == 4 psign%1 %2, %3, %4 %elif cpuflag(ssse3) psign%1 %2, %3 %elif %0 == 4 pxor %2, %3, %4 psub%1 %2, %4 %else pxor %2, %3 psub%1 %2, %3 %endif %endmacro %define PSIGNW PSIGN w, %define PSIGND PSIGN d, %macro SPLATB_LOAD 3 %if cpuflag(ssse3) movd %1, [%2-3] pshufb %1, %3 %else movd %1, [%2-3] ;to avoid crossing a cacheline punpcklbw %1, %1 SPLATW %1, %1, 3 %endif %endmacro %imacro SPLATW 2-3 0 %if cpuflag(avx2) && %3 == 0 vpbroadcastw %1, %2 %else %define %%s %2 %ifid %2 %define %%s xmm%2 %elif %3 == 0 movd xmm%1, %2 %define %%s xmm%1 %endif PSHUFLW xmm%1, %%s, (%3)*q1111 %if mmsize >= 32 vpbroadcastq %1, xmm%1 %elif mmsize == 16 punpcklqdq %1, %1 %endif %endif %endmacro %imacro SPLATD 2-3 0 %if cpuflag(avx2) && %3 == 0 vpbroadcastd %1, %2 %else %define %%s %2 %ifid %2 %define %%s xmm%2 %elif %3 == 0 movd xmm%1, %2 %define %%s xmm%1 %endif %if mmsize == 8 && %3 == 0 %ifidn %1, %%s punpckldq %1, %1 %else pshufw %1, %%s, q1010 %endif %elif mmsize == 8 && %3 == 1 %ifidn %1, %%s punpckhdq %1, %1 %else pshufw %1, %%s, q3232 %endif %else pshufd xmm%1, %%s, (%3)*q1111 %endif %if mmsize >= 32 vpbroadcastq %1, xmm%1 %endif %endif %endmacro %macro CLIPW 3 ;(dst, min, max) pmaxsw %1, %2 pminsw %1, %3 %endmacro %macro MOVHL 2 ; dst, src %ifidn %1, %2 punpckhqdq %1, %2 %elif cpuflag(avx) punpckhqdq %1, %2, %2 %elif cpuflag(sse4) pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones %else movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst %endif %endmacro %macro HADDD 2 ; sum junk %if sizeof%1 >= 64 vextracti32x8 ymm%2, zmm%1, 1 paddd ymm%1, ymm%2 %endif %if sizeof%1 >= 32 vextracti128 xmm%2, ymm%1, 1 paddd xmm%1, xmm%2 %endif %if sizeof%1 >= 16 MOVHL xmm%2, xmm%1 paddd xmm%1, xmm%2 %endif %if cpuflag(xop) && sizeof%1 == 16 vphadddq xmm%1, xmm%1 %else PSHUFLW xmm%2, xmm%1, q1032 paddd xmm%1, xmm%2 %endif %endmacro %macro HADDW 2 ; reg, tmp %if cpuflag(xop) && sizeof%1 == 16 vphaddwq %1, %1 MOVHL %2, %1 paddd %1, %2 %else pmaddwd %1, [pw_1] HADDD %1, %2 %endif %endmacro %macro HADDUWD 2 %if cpuflag(xop) && sizeof%1 == 16 vphadduwd %1, %1 %else psrld %2, %1, 16 pslld %1, 16 psrld %1, 16 paddd %1, %2 %endif %endmacro %macro HADDUW 2 %if cpuflag(xop) && sizeof%1 == 16 vphadduwq %1, %1 MOVHL %2, %1 paddd %1, %2 %else HADDUWD %1, %2 HADDD %1, %2 %endif %endmacro %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp ; AVX2 version uses a precalculated extra input that ; can be re-used across calls %if sizeof%1==32 ; %3 = abcdefgh ijklmnop (lower address) ; %2 = ABCDEFGH IJKLMNOP (higher address) ; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH %if %4 < 16 palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA %else palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO %endif %elif cpuflag(ssse3) %if %0==5 palignr %1, %2, %3, %4 %else palignr %1, %2, %3 %endif %else %define %%dst %1 %if %0==5 %ifnidn %1, %2 mova %%dst, %2 %endif %rotate 1 %endif %ifnidn %4, %2 mova %4, %2 %endif %if mmsize==8 psllq 
%%dst, (8-%3)*8 psrlq %4, %3*8 %else pslldq %%dst, 16-%3 psrldq %4, %3 %endif por %%dst, %4 %endif %endmacro %macro PSHUFLW 1+ %if mmsize == 8 pshufw %1 %else pshuflw %1 %endif %endmacro ; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes ; values shifted in are undefined ; faster if dst==src %define PSLLPIX PSXLPIX l, -1, ;dst, src, shift %define PSRLPIX PSXLPIX r, 1, ;dst, src, shift %macro PSXLPIX 5 %if mmsize == 8 %if %5&1 ps%1lq %3, %4, %5*8 %else pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff %endif %else ps%1ldq %3, %4, %5*2 %endif %endmacro %macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from %ifnum %5 pand m%3, m%5, m%4 ; src .. y6 .. y4 pand m%1, m%5, m%2 ; dst .. y6 .. y4 %else mova m%1, %5 pand m%3, m%1, m%4 ; src .. y6 .. y4 pand m%1, m%1, m%2 ; dst .. y6 .. y4 %endif psrlw m%2, 8 ; dst .. y7 .. y5 psrlw m%4, 8 ; src .. y7 .. y5 %endmacro %macro SUMSUB_BA 3-4 %if %0==3 padd%1 m%2, m%3 padd%1 m%3, m%3 psub%1 m%3, m%2 %elif avx_enabled padd%1 m%4, m%2, m%3 psub%1 m%3, m%2 SWAP %2, %4 %else mova m%4, m%2 padd%1 m%2, m%3 psub%1 m%3, m%4 %endif %endmacro %macro SUMSUB_BADC 5-6 %if %0==6 SUMSUB_BA %1, %2, %3, %6 SUMSUB_BA %1, %4, %5, %6 %else padd%1 m%2, m%3 padd%1 m%4, m%5 padd%1 m%3, m%3 padd%1 m%5, m%5 psub%1 m%3, m%2 psub%1 m%5, m%4 %endif %endmacro %macro HADAMARD4_V 4+ SUMSUB_BADC w, %1, %2, %3, %4 SUMSUB_BADC w, %1, %3, %2, %4 %endmacro %macro HADAMARD8_V 8+ SUMSUB_BADC w, %1, %2, %3, %4 SUMSUB_BADC w, %5, %6, %7, %8 SUMSUB_BADC w, %1, %3, %2, %4 SUMSUB_BADC w, %5, %7, %6, %8 SUMSUB_BADC w, %1, %5, %2, %6 SUMSUB_BADC w, %3, %7, %4, %8 %endmacro %macro TRANS_SSE2 5-6 ; TRANSPOSE2x2 ; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq ; %2: ord/unord (for compat with sse4, unused) ; %3/%4: source regs ; %5/%6: tmp regs %ifidn %1, d %define mask [mask_10] %define shift 16 %elifidn %1, q %define mask [mask_1100] %define shift 32 %endif %if %0==6 ; less dependency if we have two tmp mova m%5, mask ; ff00 mova m%6, m%4 ; x5x4 psll%1 m%4, shift ; x4.. pand m%6, m%5 ; x5.. pandn m%5, m%3 ; ..x0 psrl%1 m%3, shift ; ..x1 por m%4, m%5 ; x4x0 por m%3, m%6 ; x5x1 %else ; more dependency, one insn less. sometimes faster, sometimes not mova m%5, m%4 ; x5x4 psll%1 m%4, shift ; x4.. pxor m%4, m%3 ; (x4^x1)x0 pand m%4, mask ; (x4^x1).. 
pxor m%3, m%4 ; x4x0 psrl%1 m%4, shift ; ..(x1^x4) pxor m%5, m%4 ; x5x1 SWAP %4, %3, %5 %endif %endmacro %macro TRANS_SSE4 5-6 ; see above %ifidn %1, d %ifidn %2, ord psrl%1 m%5, m%3, 16 pblendw m%5, m%4, q2222 psll%1 m%4, 16 pblendw m%4, m%3, q1111 SWAP %3, %5 %else %if avx_enabled pblendw m%5, m%3, m%4, q2222 SWAP %3, %5 %else mova m%5, m%3 pblendw m%3, m%4, q2222 %endif psll%1 m%4, 16 psrl%1 m%5, 16 por m%4, m%5 %endif %elifidn %1, q shufps m%5, m%3, m%4, q3131 shufps m%3, m%3, m%4, q2020 SWAP %4, %5 %endif %endmacro %macro TRANS_XOP 5-6 %ifidn %1, d vpperm m%5, m%3, m%4, [transd_shuf1] vpperm m%3, m%3, m%4, [transd_shuf2] %elifidn %1, q shufps m%5, m%3, m%4, q3131 shufps m%3, m%4, q2020 %endif SWAP %4, %5 %endmacro %macro HADAMARD 5-6 ; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes) ; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes) ; %3/%4: regs ; %5(%6): tmpregs %if %1!=0 ; have to reorder stuff for horizontal op %ifidn %2, sumsub %define ORDER ord ; sumsub needs order because a-b != b-a unless a=b %else %define ORDER unord ; if we just max, order doesn't matter (allows pblendw+or in sse4) %endif %if %1==1 TRANS d, ORDER, %3, %4, %5, %6 %elif %1==2 %if mmsize==8 SBUTTERFLY dq, %3, %4, %5 %elif %0==6 TRANS q, ORDER, %3, %4, %5, %6 %else TRANS q, ORDER, %3, %4, %5 %endif %elif %1==4 SBUTTERFLY qdq, %3, %4, %5 %elif %1==8 SBUTTERFLY dqqq, %3, %4, %5 %endif %endif %ifidn %2, sumsub SUMSUB_BA w, %3, %4, %5 %else %ifidn %2, amax %if %0==6 ABSW2 m%3, m%4, m%3, m%4, m%5, m%6 %else ABSW m%3, m%3, m%5 ABSW m%4, m%4, m%5 %endif %endif pmaxsw m%3, m%4 %endif %endmacro %macro HADAMARD2_2D 6-7 sumsub HADAMARD 0, sumsub, %1, %2, %5 HADAMARD 0, sumsub, %3, %4, %5 SBUTTERFLY %6, %1, %2, %5 %ifnum %7 HADAMARD 0, amax, %1, %2, %5, %7 %else HADAMARD 0, %7, %1, %2, %5 %endif SBUTTERFLY %6, %3, %4, %5 %ifnum %7 HADAMARD 0, amax, %3, %4, %5, %7 %else HADAMARD 0, %7, %3, %4, %5 %endif %endmacro %macro HADAMARD4_2D 5-6 sumsub HADAMARD2_2D %1, %2, %3, %4, %5, wd HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6 SWAP %2, %3 %endmacro %macro HADAMARD4_2D_SSE 5-6 sumsub HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1 HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3 SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0 SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2 HADAMARD2_2D %1, %3, %2, %4, %5, dq SBUTTERFLY qdq, %1, %2, %5 HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1 SBUTTERFLY qdq, %3, %4, %5 HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3 %endmacro %macro HADAMARD8_2D 9-10 sumsub HADAMARD2_2D %1, %2, %3, %4, %9, wd HADAMARD2_2D %5, %6, %7, %8, %9, wd HADAMARD2_2D %1, %3, %2, %4, %9, dq HADAMARD2_2D %5, %7, %6, %8, %9, dq HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10 HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10 %ifnidn %10, amax SWAP %2, %5 SWAP %4, %7 %endif %endmacro ; doesn't include the "pmaddubsw hmul_8p" pass %macro HADAMARD8_2D_HMUL 10 HADAMARD4_V %1, %2, %3, %4, %9 HADAMARD4_V %5, %6, %7, %8, %9 SUMSUB_BADC w, %1, %5, %2, %6, %9 HADAMARD 2, sumsub, %1, %5, %9, %10 HADAMARD 2, sumsub, %2, %6, %9, %10 SUMSUB_BADC w, %3, %7, %4, %8, %9 HADAMARD 2, sumsub, %3, %7, %9, %10 HADAMARD 2, sumsub, %4, %8, %9, %10 HADAMARD 1, amax, %1, %5, %9, %10 HADAMARD 1, amax, %2, %6, %9, %5 HADAMARD 1, amax, %3, %7, %9, %5 HADAMARD 1, amax, %4, %8, %9, %5 %endmacro %macro SUMSUB2_AB 4 %if cpuflag(xop) pmacs%1%1 m%4, m%3, [p%1_m2], m%2 pmacs%1%1 m%2, m%2, [p%1_2], m%3 %elifnum %3 psub%1 m%4, m%2, m%3 psub%1 m%4, m%3 padd%1 m%2, m%2 padd%1 m%2, m%3 %else mova m%4, m%2 padd%1 m%2, m%2 padd%1 
m%2, %3 psub%1 m%4, %3 psub%1 m%4, %3 %endif %endmacro %macro SUMSUBD2_AB 5 %ifnum %4 psra%1 m%5, m%2, 1 ; %3: %3>>1 psra%1 m%4, m%3, 1 ; %2: %2>>1 padd%1 m%4, m%2 ; %3: %3>>1+%2 psub%1 m%5, m%3 ; %2: %2>>1-%3 SWAP %2, %5 SWAP %3, %4 %else mova %5, m%2 mova %4, m%3 psra%1 m%3, 1 ; %3: %3>>1 psra%1 m%2, 1 ; %2: %2>>1 padd%1 m%3, %5 ; %3: %3>>1+%2 psub%1 m%2, %4 ; %2: %2>>1-%3 %endif %endmacro %macro DCT4_1D 5 %ifnum %5 SUMSUB_BADC w, %4, %1, %3, %2, %5 SUMSUB_BA w, %3, %4, %5 SUMSUB2_AB w, %1, %2, %5 SWAP %1, %3, %4, %5, %2 %else SUMSUB_BADC w, %4, %1, %3, %2 SUMSUB_BA w, %3, %4 mova [%5], m%2 SUMSUB2_AB w, %1, [%5], %2 SWAP %1, %3, %4, %2 %endif %endmacro %macro IDCT4_1D 6-7 %ifnum %6 SUMSUBD2_AB %1, %3, %5, %7, %6 ; %3: %3>>1-%5 %5: %3+%5>>1 SUMSUB_BA %1, %4, %2, %7 ; %4: %2+%4 %2: %2-%4 SUMSUB_BADC %1, %5, %4, %3, %2, %7 ; %5: %2+%4 + (%3+%5>>1) ; %4: %2+%4 - (%3+%5>>1) ; %3: %2-%4 + (%3>>1-%5) ; %2: %2-%4 - (%3>>1-%5) %else %ifidn %1, w SUMSUBD2_AB %1, %3, %5, [%6], [%6+16] %else SUMSUBD2_AB %1, %3, %5, [%6], [%6+32] %endif SUMSUB_BA %1, %4, %2 SUMSUB_BADC %1, %5, %4, %3, %2 %endif SWAP %2, %5, %4 ; %2: %2+%4 + (%3+%5>>1) row0 ; %3: %2-%4 + (%3>>1-%5) row1 ; %4: %2-%4 - (%3>>1-%5) row2 ; %5: %2+%4 - (%3+%5>>1) row3 %endmacro %macro LOAD_DIFF 5-6 1 %if HIGH_BIT_DEPTH %if %6 ; %5 aligned? mova %1, %4 psubw %1, %5 %elif cpuflag(avx) movu %1, %4 psubw %1, %5 %else movu %1, %4 movu %2, %5 psubw %1, %2 %endif %else ; !HIGH_BIT_DEPTH movh %1, %4 movh %2, %5 %ifidn %3, none punpcklbw %1, %2 punpcklbw %2, %2 %else punpcklbw %1, %3 punpcklbw %2, %3 %endif psubw %1, %2 %endif ; HIGH_BIT_DEPTH %endmacro %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr %if BIT_DEPTH == 8 && cpuflag(ssse3) movh m%2, [%8+%1*FDEC_STRIDE] movh m%1, [%7+%1*FENC_STRIDE] punpcklbw m%1, m%2 movh m%3, [%8+%2*FDEC_STRIDE] movh m%2, [%7+%2*FENC_STRIDE] punpcklbw m%2, m%3 movh m%4, [%8+%3*FDEC_STRIDE] movh m%3, [%7+%3*FENC_STRIDE] punpcklbw m%3, m%4 movh m%5, [%8+%4*FDEC_STRIDE] movh m%4, [%7+%4*FENC_STRIDE] punpcklbw m%4, m%5 pmaddubsw m%1, m%6 pmaddubsw m%2, m%6 pmaddubsw m%3, m%6 pmaddubsw m%4, m%6 %else LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB] LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB] LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB] LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB] %endif %endmacro %macro STORE_DCT 6 movq [%5+%6+ 0], m%1 movq [%5+%6+ 8], m%2 movq [%5+%6+16], m%3 movq [%5+%6+24], m%4 movhps [%5+%6+32], m%1 movhps [%5+%6+40], m%2 movhps [%5+%6+48], m%3 movhps [%5+%6+56], m%4 %endmacro %macro STORE_IDCT 4 movhps [r0-4*FDEC_STRIDE], %1 movh [r0-3*FDEC_STRIDE], %1 movhps [r0-2*FDEC_STRIDE], %2 movh [r0-1*FDEC_STRIDE], %2 movhps [r0+0*FDEC_STRIDE], %3 movh [r0+1*FDEC_STRIDE], %3 movhps [r0+2*FDEC_STRIDE], %4 movh [r0+3*FDEC_STRIDE], %4 %endmacro %macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned? 
LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11 LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11 LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11 LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11 %if %10 lea %8, [%8+4*r1] lea %9, [%9+4*r3] %endif %endmacro ; 2xdst, 2xtmp, 2xsrcrow %macro LOAD_DIFF16x2_AVX2 6 pmovzxbw m%1, [r1+%5*FENC_STRIDE] pmovzxbw m%2, [r1+%6*FENC_STRIDE] pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE] pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE] psubw m%1, m%3 psubw m%2, m%4 %endmacro %macro DIFFx2 6-7 movh %3, %5 punpcklbw %3, %4 psraw %1, 6 paddsw %1, %3 movh %3, %6 punpcklbw %3, %4 psraw %2, 6 paddsw %2, %3 packuswb %2, %1 %endmacro ; (high depth) in: %1, %2, min to clip, max to clip, mem128 ; in: %1, tmp, %3, mem64 %macro STORE_DIFF 4-5 %if HIGH_BIT_DEPTH psrad %1, 6 psrad %2, 6 packssdw %1, %2 paddw %1, %5 CLIPW %1, %3, %4 mova %5, %1 %else movh %2, %4 punpcklbw %2, %3 psraw %1, 6 paddsw %1, %2 packuswb %1, %1 movh %4, %1 %endif %endmacro %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 db %1, %1 %else db %1*2 db %1*2+1 %endif %rotate 1 %endrep %endmacro ; instruction, accum, input, iteration (zero to swap, nonzero to add) %macro ACCUM 4 %if %4 %1 m%2, m%3 %else SWAP %2, %3 %endif %endmacro x264-master/config.guess000077500000000000000000001307201502133446700154020ustar00rootroot00000000000000#! /bin/sh # Attempt to guess a canonical system name. # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, # 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, # 2011, 2012 Free Software Foundation, Inc. timestamp='2012-09-25' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Originally written by Per Bothner. Please send patches (context # diff format) to and include a ChangeLog # entry. # # This script attempts to guess a canonical system name similar to # config.sub. If it succeeds, it prints the system name on stdout, and # exits with 0. Otherwise, it exits with 1. # # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] Output the configuration name of the system \`$me' is run on. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This is free software; see the source for copying conditions. 
There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" >&2 exit 1 ;; * ) break ;; esac done if test $# != 0; then echo "$me: too many arguments$help" >&2 exit 1 fi trap 'exit 1' 1 2 15 # CC_FOR_BUILD -- compiler used by this script. Note that the use of a # compiler to aid in system detection is discouraged as it requires # temporary files to be created and, as you can see below, it is a # headache to deal with in a portable fashion. # Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still # use `HOST_CC' if defined, but it is deprecated. # Portable tmp directory creation inspired by the Autoconf team. set_cc_for_build=' trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; : ${TMPDIR=/tmp} ; { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; dummy=$tmp/dummy ; tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; case $CC_FOR_BUILD,$HOST_CC,$CC in ,,) echo "int x;" > $dummy.c ; for c in cc gcc c89 c99 ; do if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then CC_FOR_BUILD="$c"; break ; fi ; done ; if test x"$CC_FOR_BUILD" = x ; then CC_FOR_BUILD=no_compiler_found ; fi ;; ,,*) CC_FOR_BUILD=$CC ;; ,*,*) CC_FOR_BUILD=$HOST_CC ;; esac ; set_cc_for_build= ;' # This is needed to find uname on a Pyramid OSx when run in the BSD universe. # (ghazi@noc.rutgers.edu 1994-08-24) if (test -f /.attbin/uname) >/dev/null 2>&1 ; then PATH=$PATH:/.attbin ; export PATH fi UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown # msys2 always reports `uname -m` as `x86_64` # but `$MSYSTEM_CARCH` reports the real toolchain target architecture of the msys2 environment # Link: https://github.com/msys2/msys2-runtime/issues/171 if test "$MSYSTEM_CARCH" != ""; then UNAME_MACHINE="$MSYSTEM_CARCH" fi # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward # compatibility and a consistent mechanism for selecting the # object file format. # # Note: NetBSD doesn't particularly care about the vendor # portion of the name. We always set it to "unknown". 
sysctl="sysctl -n hw.machine_arch" UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ /usr/sbin/$sysctl 2>/dev/null || echo unknown)` case "${UNAME_MACHINE_ARCH}" in armeb) machine=armeb-unknown ;; arm*) machine=arm-unknown ;; sh3el) machine=shl-unknown ;; sh3eb) machine=sh-unknown ;; sh5el) machine=sh5le-unknown ;; *) machine=${UNAME_MACHINE_ARCH}-unknown ;; esac # The Operating System including object format, if it has switched # to ELF recently, or will in the future. case "${UNAME_MACHINE_ARCH}" in arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ELF__ then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? os=netbsd else os=netbsdelf fi ;; *) os=netbsd ;; esac # The OS release # Debian GNU/NetBSD machines have a different userland, and # thus, need a distinct triplet. However, they do not need # kernel version information, so it can be replaced with a # suitable tag, in the style of linux-gnu. case "${UNAME_VERSION}" in Debian*) release='-gnu' ;; *) release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` ;; esac # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: # contains redundant information, the shorter form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. echo "${machine}-${os}${release}" exit ;; *:Bitrig:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE} exit ;; *:OpenBSD:*:*) UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} exit ;; *:SolidBSD:*:*) echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} exit ;; macppc:MirBSD:*:*) echo powerpc-unknown-mirbsd${UNAME_RELEASE} exit ;; *:MirBSD:*:*) echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on # OSF/1 and Tru64 systems produced since 1995. I hope that # covers most systems running today. This code pipes the CPU # types through head -n 1, so we only detect the type of CPU 0. ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` case "$ALPHA_CPU_TYPE" in "EV4 (21064)") UNAME_MACHINE="alpha" ;; "EV4.5 (21064)") UNAME_MACHINE="alpha" ;; "LCA4 (21066/21068)") UNAME_MACHINE="alpha" ;; "EV5 (21164)") UNAME_MACHINE="alphaev5" ;; "EV5.6 (21164A)") UNAME_MACHINE="alphaev56" ;; "EV5.6 (21164PC)") UNAME_MACHINE="alphapca56" ;; "EV5.7 (21164PC)") UNAME_MACHINE="alphapca57" ;; "EV6 (21264)") UNAME_MACHINE="alphaev6" ;; "EV6.7 (21264A)") UNAME_MACHINE="alphaev67" ;; "EV6.8CB (21264C)") UNAME_MACHINE="alphaev68" ;; "EV6.8AL (21264B)") UNAME_MACHINE="alphaev68" ;; "EV6.8CX (21264D)") UNAME_MACHINE="alphaev68" ;; "EV6.9A (21264/EV69A)") UNAME_MACHINE="alphaev69" ;; "EV7 (21364)") UNAME_MACHINE="alphaev7" ;; "EV7.9 (21364A)") UNAME_MACHINE="alphaev79" ;; esac # A Pn.n version is a patched version. # A Vn.n version is a released version. # A Tn.n version is a released field test version. # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. 
echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` # Reset EXIT trap before exiting to avoid spurious non-zero exit code. exitcode=$? trap '' 0 exit $exitcode ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # Should we change UNAME_MACHINE based on the output of uname instead # of the specific Alpha model? echo alpha-pc-interix exit ;; 21064:Windows_NT:50:3) echo alpha-dec-winnt3.5 exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-unknown-sysv4 exit ;; *:[Aa]miga[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-amigaos exit ;; *:[Mm]orph[Oo][Ss]:*:*) echo ${UNAME_MACHINE}-unknown-morphos exit ;; *:OS/390:*:*) echo i370-ibm-openedition exit ;; *:z/VM:*:*) echo s390-ibm-zvmoe exit ;; *:OS400:*:*) echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} exit ;; arm*:riscos:*:*|arm*:RISCOS:*:*) echo arm-unknown-riscos exit ;; SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) echo hppa1.1-hitachi-hiuxmpp exit ;; Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. if test "`(/bin/universe) 2>/dev/null`" = att ; then echo pyramid-pyramid-sysv3 else echo pyramid-pyramid-bsd fi exit ;; NILE*:*:*:dcosx) echo pyramid-pyramid-svr4 exit ;; DRS?6000:unix:4.0:6*) echo sparc-icl-nx6 exit ;; DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) case `/usr/bin/uname -p` in sparc) echo sparc-icl-nx7; exit ;; esac ;; s390x:SunOS:*:*) echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4H:SunOS:5.*:*) echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) echo i386-pc-auroraux${UNAME_RELEASE} exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) eval $set_cc_for_build SUN_ARCH="i386" # If there is a compiler, see if it is configured for 64-bit objects. # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. # This test works for both compilers. if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then SUN_ARCH="x86_64" fi fi echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:6*:*) # According to config.sub, this is the proper way to canonicalize # SunOS6. Hard to guess exactly what SunOS6 will be like, but # it's likely to be more like Solaris than SunOS4. echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; sun4*:SunOS:*:*) case "`/usr/bin/arch -k`" in Series*|S4*) UNAME_RELEASE=`uname -v` ;; esac # Japanese Language versions have a version number like `4.1.3-JL'. echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` exit ;; sun3*:SunOS:*:*) echo m68k-sun-sunos${UNAME_RELEASE} exit ;; sun*:*:4.2BSD:*) UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 case "`/bin/arch`" in sun3) echo m68k-sun-sunos${UNAME_RELEASE} ;; sun4) echo sparc-sun-sunos${UNAME_RELEASE} ;; esac exit ;; aushp:SunOS:*:*) echo sparc-auspex-sunos${UNAME_RELEASE} exit ;; # The situation for MiNT is a little confusing. 
The machine name # can be virtually everything (everything which is not # "atarist" or "atariste" at least should have a processor # > m68000). The system name ranges from "MiNT" over "FreeMiNT" # to the lowercase version "mint" (or "freemint"). Finally # the system name "TOS" denotes a system which is actually not # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) echo m68k-milan-mint${UNAME_RELEASE} exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) echo m68k-hades-mint${UNAME_RELEASE} exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) echo m68k-unknown-mint${UNAME_RELEASE} exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; powerpc:machten:*:*) echo powerpc-apple-machten${UNAME_RELEASE} exit ;; RISC*:Mach:*:*) echo mips-dec-mach_bsd4.3 exit ;; RISC*:ULTRIX:*:*) echo mips-dec-ultrix${UNAME_RELEASE} exit ;; VAX*:ULTRIX*:*:*) echo vax-dec-ultrix${UNAME_RELEASE} exit ;; 2020:CLIX:*:* | 2430:CLIX:*:*) echo clipper-intergraph-clix${UNAME_RELEASE} exit ;; mips:*:*:UMIPS | mips:*:*:RISCos) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __cplusplus #include /* for printf() prototype */ int main (int argc, char *argv[]) { #else int main (argc, argv) int argc; char *argv[]; { #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); #endif #endif exit (-1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && SYSTEM_NAME=`$dummy $dummyarg` && { echo "$SYSTEM_NAME"; exit; } echo mips-mips-riscos${UNAME_RELEASE} exit ;; Motorola:PowerMAX_OS:*:*) echo powerpc-motorola-powermax exit ;; Motorola:*:4.3:PL8-*) echo powerpc-harris-powermax exit ;; Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) echo powerpc-harris-powermax exit ;; Night_Hawk:Power_UNIX:*:*) echo powerpc-harris-powerunix exit ;; m88k:CX/UX:7*:*) echo m88k-harris-cxux7 exit ;; m88k:*:4*:R4*) echo m88k-motorola-sysv4 exit ;; m88k:*:3*:R3*) echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) # DG/UX returns AViiON for all architectures UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ [ ${TARGET_BINARY_INTERFACE}x = x ] then echo m88k-dg-dgux${UNAME_RELEASE} else echo m88k-dg-dguxbcs${UNAME_RELEASE} fi else echo i586-dg-dgux${UNAME_RELEASE} fi exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; M88*:*:R3*:*) # Delta 88k system running SVR3 echo m88k-motorola-sysv3 exit ;; XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) echo m88k-tektronix-sysv3 exit ;; Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) echo m68k-tektronix-bsd exit ;; *:IRIX*:*:*) echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` exit ;; ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' i*86:AIX:*:*) echo i386-ibm-aix exit ;; ia64:AIX:*:*) if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} exit ;; *:AIX:2:3) if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include main() { if (!__power_pc()) exit(1); puts("powerpc-ibm-aix3.2.5"); exit(0); } EOF if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` then echo "$SYSTEM_NAME" else echo rs6000-ibm-aix3.2.5 fi elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then echo rs6000-ibm-aix3.2.4 else echo rs6000-ibm-aix3.2 fi exit ;; *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 else IBM_ARCH=powerpc fi if [ -x /usr/bin/oslevel ] ; then IBM_REV=`/usr/bin/oslevel` else IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} fi echo ${IBM_ARCH}-ibm-aix${IBM_REV} exit ;; *:AIX:*:*) echo rs6000-ibm-aix exit ;; ibmrt:4.4BSD:*|romp-ibm:BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to exit ;; # report: romp-ibm BSD 4.3 *:BOSX:*:*) echo rs6000-bull-bosx exit ;; DPX/2?00:B.O.S.:*:*) echo m68k-bull-sysv3 exit ;; 9000/[34]??:4.3bsd:1.*:*) echo m68k-hp-bsd exit ;; hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) echo m68k-hp-bsd4.4 exit ;; 9000/[34678]??:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` case "${UNAME_MACHINE}" in 9000/31? ) HP_ARCH=m68000 ;; 9000/[34]?? ) HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` case "${sc_cpu_version}" in 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 532) # CPU_PA_RISC2_0 case "${sc_kernel_bits}" in 32) HP_ARCH="hppa2.0n" ;; 64) HP_ARCH="hppa2.0w" ;; '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 esac ;; esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #define _HPUX_SOURCE #include #include int main () { #if defined(_SC_KERNEL_BITS) long bits = sysconf(_SC_KERNEL_BITS); #endif long cpu = sysconf (_SC_CPU_VERSION); switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0"); break; case CPU_PA_RISC1_1: puts ("hppa1.1"); break; case CPU_PA_RISC2_0: #if defined(_SC_KERNEL_BITS) switch (bits) { case 64: puts ("hppa2.0w"); break; case 32: puts ("hppa2.0n"); break; default: puts ("hppa2.0"); break; } break; #else /* !defined(_SC_KERNEL_BITS) */ puts ("hppa2.0"); break; #endif default: puts ("hppa1.0"); break; } exit (0); } EOF (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa fi ;; esac if [ ${HP_ARCH} = "hppa2.0w" ] then eval $set_cc_for_build # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler # generating 64-bit code. 
GNU and HP use different nomenclature: # # $ CC_FOR_BUILD=cc ./config.guess # => hppa2.0w-hp-hpux11.23 # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | grep -q __LP64__ then HP_ARCH="hppa2.0w" else HP_ARCH="hppa64" fi fi echo ${HP_ARCH}-hp-hpux${HPUX_REV} exit ;; ia64:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` echo ia64-hp-hpux${HPUX_REV} exit ;; 3050*:HI-UX:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #include int main () { long cpu = sysconf (_SC_CPU_VERSION); /* The order matters, because CPU_IS_HP_MC68K erroneously returns true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct results, however. */ if (CPU_IS_PA_RISC (cpu)) { switch (cpu) { case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; default: puts ("hppa-hitachi-hiuxwe2"); break; } } else if (CPU_IS_HP_MC68K (cpu)) puts ("m68k-hitachi-hiuxwe2"); else puts ("unknown-hitachi-hiuxwe2"); exit (0); } EOF $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) echo hppa1.0-hp-bsd exit ;; *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) echo hppa1.0-hp-osf exit ;; i*86:OSF1:*:*) if [ -x /usr/sbin/sysversion ] ; then echo ${UNAME_MACHINE}-unknown-osf1mk else echo ${UNAME_MACHINE}-unknown-osf1 fi exit ;; parisc*:Lites*:*:*) echo hppa1.1-hp-lites exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) echo c34-convex-bsd exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*[A-Z]90:*:*:*) echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ -e 's/\.[^.]*$/.X/' exit ;; CRAY*TS:*:*:*) echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*T3E:*:*:*) echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; CRAY*SV1:*:*:*) echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; *:UNICOS/mp:*:*) echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; 5000:UNIX_System_V:4.*:*) FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) echo 
${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} exit ;; sparc*:BSD/OS:*:*) echo sparc-unknown-bsdi${UNAME_RELEASE} exit ;; *:BSD/OS:*:*) echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) UNAME_PROCESSOR=`/usr/bin/uname -p` case ${UNAME_PROCESSOR} in amd64) echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; *) echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; esac exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin exit ;; *:MINGW64*:*) echo ${UNAME_MACHINE}-pc-mingw64 exit ;; *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; *:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; i*:windows32*:*) # uname -m includes "-pc" on this system. echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; *:Interix*:*) case ${UNAME_MACHINE} in x86) echo i586-pc-interix${UNAME_RELEASE} exit ;; authenticamd | genuineintel | EM64T) echo x86_64-unknown-interix${UNAME_RELEASE} exit ;; IA64) echo ia64-unknown-interix${UNAME_RELEASE} exit ;; esac ;; [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; 8664:Windows_NT:*) echo x86_64-pc-mks exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we # UNAME_MACHINE based on the output of uname instead of i386? echo i586-pc-interix exit ;; i*:UWIN*:*) echo ${UNAME_MACHINE}-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-unknown-cygwin exit ;; p*:CYGWIN*:*) echo powerpcle-unknown-cygwin exit ;; prep*:SunOS:5.*:*) echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; *:GNU:*:*) # the GNU system echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in EV5) UNAME_MACHINE=alphaev5 ;; EV56) UNAME_MACHINE=alphaev56 ;; PCA56) UNAME_MACHINE=alphapca56 ;; PCA57) UNAME_MACHINE=alphapca56 ;; EV6) UNAME_MACHINE=alphaev6 ;; EV67) UNAME_MACHINE=alphaev67 ;; EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then echo ${UNAME_MACHINE}-unknown-linux-gnu else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then echo ${UNAME_MACHINE}-unknown-linux-gnueabi else echo ${UNAME_MACHINE}-unknown-linux-gnueabihf fi fi exit ;; avr32*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; cris:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; crisv32:Linux:*:*) echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; frv:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; hexagon:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; i*86:Linux:*:*) LIBC=gnu eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #ifdef __dietlibc__ LIBC=dietlibc #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` echo "${UNAME_MACHINE}-pc-linux-${LIBC}" exit ;; ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; loongarch32:Linux:*:* | loongarch64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; m68*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU #undef ${UNAME_MACHINE} #undef ${UNAME_MACHINE}el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) CPU=${UNAME_MACHINE}el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) CPU=${UNAME_MACHINE} #else CPU= #endif #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } ;; or32:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; padre:Linux:*:*) echo sparc-unknown-linux-gnu exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) echo hppa64-unknown-linux-gnu exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in PA7*) echo hppa1.1-unknown-linux-gnu ;; PA8*) echo hppa2.0-unknown-linux-gnu ;; *) echo hppa-unknown-linux-gnu ;; esac exit ;; ppc64:Linux:*:*) echo powerpc64-unknown-linux-gnu exit ;; ppc64le:Linux:*:*) echo powerpc64le-unknown-linux-gnu exit ;; ppc:Linux:*:*) echo powerpc-unknown-linux-gnu exit ;; riscv64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; s390:Linux:*:* | s390x:Linux:*:*) echo ${UNAME_MACHINE}-ibm-linux exit ;; sh64*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sh*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; tile*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; vax:Linux:*:*) echo ${UNAME_MACHINE}-dec-linux-gnu exit ;; x86_64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; xtensa*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. # earlier versions are messed up and put the nodename in both # sysname and nodename. echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) # Unixware is an offshoot of SVR4, but it has its own version # number series starting with 2... # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. # Use sysv4.2uw... so that sysv4* matches it. 
echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) # If we were able to find `uname', then EMX Unix compatibility # is probably installed. echo ${UNAME_MACHINE}-pc-os2-emx exit ;; i*86:XTS-300:*:STOP) echo ${UNAME_MACHINE}-unknown-stop exit ;; i*86:atheos:*:*) echo ${UNAME_MACHINE}-unknown-atheos exit ;; i*86:syllable:*:*) echo ${UNAME_MACHINE}-pc-syllable exit ;; i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) echo i386-unknown-lynxos${UNAME_RELEASE} exit ;; i*86:*DOS:*:*) echo ${UNAME_MACHINE}-pc-msdosdjgpp exit ;; i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} else echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} fi exit ;; i*86:*:5:[678]*) # UnixWare 7.x, OpenUNIX and OpenServer 6. case `/bin/uname -X | grep "^Machine"` in *486*) UNAME_MACHINE=i486 ;; *Pentium) UNAME_MACHINE=i586 ;; *Pent*|*Celeron) UNAME_MACHINE=i686 ;; esac echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} exit ;; i*86:*:3.2:*) if test -f /usr/options/cb.name; then UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ && UNAME_MACHINE=i586 (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ && UNAME_MACHINE=i686 (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ && UNAME_MACHINE=i686 echo ${UNAME_MACHINE}-pc-sco$UNAME_REL else echo ${UNAME_MACHINE}-pc-sysv32 fi exit ;; pc:*:*:*) # Left here for compatibility: # uname -m prints for DJGPP always 'pc', but it prints nothing about # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub # prints for the "djgpp" host, or else GDB configure will decide that # this is a cross-build. echo i586-pc-msdosdjgpp exit ;; Intel:Mach:3*:*) echo i386-pc-mach3 exit ;; paragon:*:*:*) echo i860-intel-osf1 exit ;; i860:*:4.*:*) # i860-SVR4 if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 else # Add other i860-SVR4 vendors below as they are discovered. 
echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 fi exit ;; mini*:CTIX:SYS*5:*) # "miniframe" echo m68010-convergent-sysv exit ;; mc68k:UNIX:SYSTEM5:3.51m) echo m68k-convergent-sysv exit ;; M680?0:D-NIX:5.3:*) echo m68k-diab-dnix exit ;; M68*:*:R3V[5678]*:*) test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) OS_REL='' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4; exit; } ;; NCR*:*:4.2:* | MPRAS*:*:4.2:*) OS_REL='.3' test -r /etc/.relid \ && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ && { echo i486-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) echo m68k-unknown-lynxos${UNAME_RELEASE} exit ;; mc68030:UNIX_System_V:4.*:*) echo m68k-atari-sysv4 exit ;; TSUNAMI:LynxOS:2.*:*) echo sparc-unknown-lynxos${UNAME_RELEASE} exit ;; rs6000:LynxOS:2.*:*) echo rs6000-unknown-lynxos${UNAME_RELEASE} exit ;; PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) echo powerpc-unknown-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) echo mips-dde-sysv${UNAME_RELEASE} exit ;; RM*:ReliantUNIX-*:*:*) echo mips-sni-sysv4 exit ;; RM*:SINIX-*:*:*) echo mips-sni-sysv4 exit ;; *:SINIX-*:*:*) if uname -p 2>/dev/null >/dev/null ; then UNAME_MACHINE=`(uname -p) 2>/dev/null` echo ${UNAME_MACHINE}-sni-sysv4 else echo ns32k-sni-sysv fi exit ;; PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort # says echo i586-unisys-sysv4 exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes . # How about differentiating between stratus architectures? -djm echo hppa1.1-stratus-sysv4 exit ;; *:*:*:FTX*) # From seanf@swdc.stratus.com. echo i860-stratus-sysv4 exit ;; i*86:VOS:*:*) # From Paul.Green@stratus.com. echo ${UNAME_MACHINE}-stratus-vos exit ;; *:VOS:*:*) # From Paul.Green@stratus.com. echo hppa1.1-stratus-vos exit ;; mc68*:A/UX:*:*) echo m68k-apple-aux${UNAME_RELEASE} exit ;; news*:NEWS-OS:6*:*) echo mips-sony-newsos6 exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then echo mips-nec-sysv${UNAME_RELEASE} else echo mips-unknown-sysv${UNAME_RELEASE} fi exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. echo powerpc-be-beos exit ;; BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. echo powerpc-apple-beos exit ;; BePC:BeOS:*:*) # BeOS running on Intel PC compatible. echo i586-pc-beos exit ;; BePC:Haiku:*:*) # Haiku running on Intel PC compatible. 
echo i586-pc-haiku exit ;; x86_64:Haiku:*:*) echo x86_64-unknown-haiku exit ;; SX-4:SUPER-UX:*:*) echo sx4-nec-superux${UNAME_RELEASE} exit ;; SX-5:SUPER-UX:*:*) echo sx5-nec-superux${UNAME_RELEASE} exit ;; SX-6:SUPER-UX:*:*) echo sx6-nec-superux${UNAME_RELEASE} exit ;; SX-7:SUPER-UX:*:*) echo sx7-nec-superux${UNAME_RELEASE} exit ;; SX-8:SUPER-UX:*:*) echo sx8-nec-superux${UNAME_RELEASE} exit ;; SX-8R:SUPER-UX:*:*) echo sx8r-nec-superux${UNAME_RELEASE} exit ;; Power*:Rhapsody:*:*) echo powerpc-apple-rhapsody${UNAME_RELEASE} exit ;; *:Rhapsody:*:*) echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} exit ;; arm64:Darwin:*:*) echo arm64-apple-darwin${UNAME_RELEASE} exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown case $UNAME_PROCESSOR in i386) eval $set_cc_for_build if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ grep IS_64BIT_ARCH >/dev/null then UNAME_PROCESSOR="x86_64" fi fi ;; unknown) UNAME_PROCESSOR=powerpc ;; esac echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) UNAME_PROCESSOR=`uname -p` if test "$UNAME_PROCESSOR" = "x86"; then UNAME_PROCESSOR=i386 UNAME_MACHINE=pc fi echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} exit ;; *:QNX:*:4*) echo i386-pc-qnx exit ;; NEO-?:NONSTOP_KERNEL:*:*) echo neo-tandem-nsk${UNAME_RELEASE} exit ;; NSE-*:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; NSR-?:NONSTOP_KERNEL:*:*) echo nsr-tandem-nsk${UNAME_RELEASE} exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux exit ;; BS2000:POSIX*:*:*) echo bs2000-siemens-sysv exit ;; DS/*:UNIX_System_V:*:*) echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} exit ;; *:Plan9:*:*) # "uname -m" is not consistent, so use $cputype instead. 386 # is converted to i386 for consistency with other x86 # operating systems. if test "$cputype" = "386"; then UNAME_MACHINE=i386 else UNAME_MACHINE="$cputype" fi echo ${UNAME_MACHINE}-unknown-plan9 exit ;; *:TOPS-10:*:*) echo pdp10-unknown-tops10 exit ;; *:TENEX:*:*) echo pdp10-unknown-tenex exit ;; KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) echo pdp10-dec-tops20 exit ;; XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) echo pdp10-xkl-tops20 exit ;; *:TOPS-20:*:*) echo pdp10-unknown-tops20 exit ;; *:ITS:*:*) echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; V*) echo vax-dec-vms ; exit ;; esac ;; *:XENIX:*:SysV) echo i386-pc-xenix exit ;; i*86:skyos:*:*) echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' exit ;; i*86:rdos:*:*) echo ${UNAME_MACHINE}-pc-rdos exit ;; i*86:AROS:*:*) echo ${UNAME_MACHINE}-pc-aros exit ;; x86_64:VMkernel:*:*) echo ${UNAME_MACHINE}-unknown-esx exit ;; esac eval $set_cc_for_build cat >$dummy.c < # include #endif main () { #if defined (sony) #if defined (MIPSEB) /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, I don't know.... 
*/ printf ("mips-sony-bsd\n"); exit (0); #else #include printf ("m68k-sony-newsos%s\n", #ifdef NEWSOS4 "4" #else "" #endif ); exit (0); #endif #endif #if defined (__arm) && defined (__acorn) && defined (__unix) printf ("arm-acorn-riscix\n"); exit (0); #endif #if defined (hp300) && !defined (hpux) printf ("m68k-hp-bsd\n"); exit (0); #endif #if defined (NeXT) #if !defined (__ARCHITECTURE__) #define __ARCHITECTURE__ "m68k" #endif int version; version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; if (version < 4) printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); else printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); exit (0); #endif #if defined (MULTIMAX) || defined (n16) #if defined (UMAXV) printf ("ns32k-encore-sysv\n"); exit (0); #else #if defined (CMU) printf ("ns32k-encore-mach\n"); exit (0); #else printf ("ns32k-encore-bsd\n"); exit (0); #endif #endif #endif #if defined (__386BSD__) printf ("i386-pc-bsd\n"); exit (0); #endif #if defined (sequent) #if defined (i386) printf ("i386-sequent-dynix\n"); exit (0); #endif #if defined (ns32000) printf ("ns32k-sequent-dynix\n"); exit (0); #endif #endif #if defined (_SEQUENT_) struct utsname un; uname(&un); if (strncmp(un.version, "V2", 2) == 0) { printf ("i386-sequent-ptx2\n"); exit (0); } if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ printf ("i386-sequent-ptx1\n"); exit (0); } printf ("i386-sequent-ptx\n"); exit (0); #endif #if defined (vax) # if !defined (ultrix) # include # if defined (BSD) # if BSD == 43 printf ("vax-dec-bsd4.3\n"); exit (0); # else # if BSD == 199006 printf ("vax-dec-bsd4.3reno\n"); exit (0); # else printf ("vax-dec-bsd\n"); exit (0); # endif # endif # else printf ("vax-dec-bsd\n"); exit (0); # endif # else printf ("vax-dec-ultrix\n"); exit (0); # endif #endif #if defined (alliant) && defined (i860) printf ("i860-alliant-bsd\n"); exit (0); #endif exit (1); } EOF $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && { echo "$SYSTEM_NAME"; exit; } # Apollos put the system type in the environment. test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } # Convex versions that predate uname can use getsysinfo(1) if [ -x /usr/convex/getsysinfo ] then case `getsysinfo -f cpu_type` in c1*) echo c1-convex-bsd exit ;; c2*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi exit ;; c34*) echo c34-convex-bsd exit ;; c38*) echo c38-convex-bsd exit ;; c4*) echo c4-convex-bsd exit ;; esac fi cat >&2 < in order to provide the needed information to handle your system. 
config.guess timestamp = $timestamp uname -m = `(uname -m) 2>/dev/null || echo unknown` uname -r = `(uname -r) 2>/dev/null || echo unknown` uname -s = `(uname -s) 2>/dev/null || echo unknown` uname -v = `(uname -v) 2>/dev/null || echo unknown` /usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` /bin/uname -X = `(/bin/uname -X) 2>/dev/null` hostinfo = `(hostinfo) 2>/dev/null` /bin/universe = `(/bin/universe) 2>/dev/null` /usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` /bin/arch = `(/bin/arch) 2>/dev/null` /usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` /usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` UNAME_MACHINE = ${UNAME_MACHINE} UNAME_RELEASE = ${UNAME_RELEASE} UNAME_SYSTEM = ${UNAME_SYSTEM} UNAME_VERSION = ${UNAME_VERSION} EOF exit 1 # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: x264-master/config.sub000077500000000000000000001060031502133446700150420ustar00rootroot00000000000000#! /bin/sh # Configuration validation subroutine script. # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, # 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, # 2011, 2012 Free Software Foundation, Inc. timestamp='2012-12-06' # This file is (in principle) common to ALL GNU software. # The presence of a machine in this file suggests that SOME GNU software # can handle that machine. It does not imply ALL GNU software can. # # This file is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under # the same distribution terms that you use for the rest of that program. # Please send patches to . Submit a context # diff and a properly formatted GNU ChangeLog entry. # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. # If it is invalid, we print an error message on stderr and exit with code 1. # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases # that are meaningful with *any* GNU software. # Each package is responsible for reporting which valid configurations # it does not support. The user should be able to distinguish # a failure to support a valid configuration from a meaningless # configuration. 
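# For orientation, a few illustrative canonicalizations (a sketch of typical
# behaviour for this vintage of the script; the outputs below are assumed from
# the case tables that follow, not quoted from its documentation):
#
#   $ sh config.sub amd64-linux     =>  x86_64-pc-linux-gnu
#   $ sh config.sub i686-mingw32    =>  i686-pc-mingw32
#   $ sh config.sub sun4            =>  sparc-sun-sunos4.1.1
#
# CPU aliases (amd64 -> x86_64), a default manufacturer (pc, sun) and a
# normalized OS suffix are filled in so every caller sees the same triplet form.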
# The goal of this file is to map all the various variations of a given # machine specification into a single specification in the form: # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM # or in some cases, the newer four-part form: # CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM # It is wrong to echo any other type of specification. me=`echo "$0" | sed -e 's,.*/,,'` usage="\ Usage: $0 [OPTION] CPU-MFR-OPSYS $0 [OPTION] ALIAS Canonicalize a configuration name. Operation modes: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit Report bugs and patches to ." version="\ GNU config.sub ($timestamp) Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." help=" Try \`$me --help' for more information." # Parse command line while test $# -gt 0 ; do case $1 in --time-stamp | --time* | -t ) echo "$timestamp" ; exit ;; --version | -v ) echo "$version" ; exit ;; --help | --h* | -h ) echo "$usage"; exit ;; -- ) # Stop option processing shift; break ;; - ) # Use stdin as input. break ;; -* ) echo "$me: invalid option $1$help" exit 1 ;; *local*) # First pass through any local machine types. echo $1 exit ;; * ) break ;; esac done case $# in 0) echo "$me: missing argument$help" >&2 exit 1;; 1) ;; *) echo "$me: too many arguments$help" >&2 exit 1;; esac # Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). # Here we must recognize all the valid KERNEL-OS combinations. maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` case $maybe_os in nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \ linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \ knetbsd*-gnu* | netbsd*-gnu* | \ kopensolaris*-gnu* | \ storm-chaos* | os2-emx* | rtmk-nova*) os=-$maybe_os basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; android-linux) os=-linux-android basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown ;; *) basic_machine=`echo $1 | sed 's/-[^-]*$//'` if [ $basic_machine != $1 ] then os=`echo $1 | sed 's/.*-/-/'` else os=; fi ;; esac ### Let's recognize common machines as not being operating systems so ### that things like config.sub decstation-3100 work. We also ### recognize some manufacturers as not being operating systems, so we ### can provide default operating systems below. case $os in -sun*os*) # Prevent following clause from handling this invalid input. 
;; -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ -apple | -axis | -knuth | -cray | -microblaze*) os= basic_machine=$1 ;; -bluegene*) os=-cnk ;; -sim | -cisco | -oki | -wec | -winbond) os= basic_machine=$1 ;; -scout) ;; -wrs) os=-vxworks basic_machine=$1 ;; -chorusos*) os=-chorusos basic_machine=$1 ;; -chorusrdb) os=-chorusrdb basic_machine=$1 ;; -hiux*) os=-hiuxwe2 ;; -sco6) os=-sco5v6 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5) os=-sco3.2v5 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco4) os=-sco3.2v4 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2.[4-9]*) os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco3.2v[4-9]*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco5v6*) # Don't forget version if it is 3.2v4 or newer. basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -sco*) os=-sco3.2v2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -udk*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -isc) os=-isc2.2 basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -clix*) basic_machine=clipper-intergraph ;; -isc*) basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` ;; -lynx*178) os=-lynxos178 ;; -lynx*5) os=-lynxos5 ;; -lynx*) os=-lynxos ;; -ptx*) basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` ;; -windowsnt*) os=`echo $os | sed -e 's/windowsnt/winnt/'` ;; -psos*) os=-psos ;; -mint | -mint[0-9]*) basic_machine=m68k-atari os=-mint ;; esac # Decode aliases for certain CPU-COMPANY combinations. case $basic_machine in # Recognize the basic CPU types without company name. # Some are omitted here because they have special meanings below. 
1750a | 580 \ | a29k \ | aarch64 | aarch64_be | arm64 \ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ | arc \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ | be32 | be64 \ | bfin \ | c4x | clipper \ | d10v | d30v | dlx | dsp16xx \ | epiphany \ | fido | fr30 | frv \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ | i370 | i860 | i960 | ia64 \ | ip2k | iq2000 \ | le32 | le64 \ | lm32 \ | loongarch32 | loongarch64 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ | mips16 \ | mips64 | mips64el \ | mips64octeon | mips64octeonel \ | mips64orion | mips64orionel \ | mips64r5900 | mips64r5900el \ | mips64vr | mips64vrel \ | mips64vr4100 | mips64vr4100el \ | mips64vr4300 | mips64vr4300el \ | mips64vr5000 | mips64vr5000el \ | mips64vr5900 | mips64vr5900el \ | mipsisa32 | mipsisa32el \ | mipsisa32r2 | mipsisa32r2el \ | mipsisa64 | mipsisa64el \ | mipsisa64r2 | mipsisa64r2el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le | nds32be \ | nios | nios2 \ | ns16k | ns32k \ | open8 \ | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ | rl78 | rx \ | score \ | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ | sh64 | sh64le \ | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ | spu \ | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | we32k \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown ;; c54x) basic_machine=tic54x-unknown ;; c55x) basic_machine=tic55x-unknown ;; c6x) basic_machine=tic6x-unknown ;; m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip) basic_machine=$basic_machine-unknown os=-none ;; m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) ;; ms1) basic_machine=mt-unknown ;; strongarm | thumb | xscale) basic_machine=arm-unknown ;; xgate) basic_machine=$basic_machine-unknown os=-none ;; xscaleeb) basic_machine=armeb-unknown ;; xscaleel) basic_machine=armel-unknown ;; # We use `pc' rather than `unknown' # because (1) that's what they normally are, and # (2) the word "unknown" tends to confuse beginning users. i*86 | x86_64) basic_machine=$basic_machine-pc ;; # Object if more than one company name word. *-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; # Recognize the basic CPU types with company name. 
580-* \ | a29k-* \ | aarch64-* | aarch64_be-* | arm64*-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ | i*86-* | i860-* | i960-* | ia64-* \ | ip2k-* | iq2000-* \ | le32-* | le64-* \ | lm32-* \ | loongarch32-* | loongarch64-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ | microblaze-* | microblazeel-* \ | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ | mips16-* \ | mips64-* | mips64el-* \ | mips64octeon-* | mips64octeonel-* \ | mips64orion-* | mips64orionel-* \ | mips64r5900-* | mips64r5900el-* \ | mips64vr-* | mips64vrel-* \ | mips64vr4100-* | mips64vr4100el-* \ | mips64vr4300-* | mips64vr4300el-* \ | mips64vr5000-* | mips64vr5000el-* \ | mips64vr5900-* | mips64vr5900el-* \ | mipsisa32-* | mipsisa32el-* \ | mipsisa32r2-* | mipsisa32r2el-* \ | mipsisa64-* | mipsisa64el-* \ | mipsisa64r2-* | mipsisa64r2el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ | nios-* | nios2-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ | orion-* \ | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \ | pyramid-* \ | rl78-* | romp-* | rs6000-* | rx-* \ | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ | sparclite-* \ | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \ | tahoe-* \ | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ | tile*-* \ | tron-* \ | ubicom32-* \ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ | ymp-* \ | z8k-* | z80-*) ;; # Recognize the basic CPU types without company name, with glob match. xtensa*) basic_machine=$basic_machine-unknown ;; # Recognize the various machine names and aliases which stand # for a CPU type and a company and sometimes even an OS. 
386bsd) basic_machine=i386-unknown os=-bsd ;; 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) basic_machine=m68000-att ;; 3b*) basic_machine=we32k-att ;; a29khif) basic_machine=a29k-amd os=-udi ;; abacus) basic_machine=abacus-unknown ;; adobe68k) basic_machine=m68010-adobe os=-scout ;; alliant | fx80) basic_machine=fx80-alliant ;; altos | altos3068) basic_machine=m68k-altos ;; am29k) basic_machine=a29k-none os=-bsd ;; amd64) basic_machine=x86_64-pc ;; amd64-*) basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; amdahl) basic_machine=580-amdahl os=-sysv ;; amiga | amiga-*) basic_machine=m68k-unknown ;; amigaos | amigados) basic_machine=m68k-unknown os=-amigaos ;; amigaunix | amix) basic_machine=m68k-unknown os=-sysv4 ;; apollo68) basic_machine=m68k-apollo os=-sysv ;; apollo68bsd) basic_machine=m68k-apollo os=-bsd ;; aros) basic_machine=i386-pc os=-aros ;; aux) basic_machine=m68k-apple os=-aux ;; balance) basic_machine=ns32k-sequent os=-dynix ;; blackfin) basic_machine=bfin-unknown os=-linux ;; blackfin-*) basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; bluegene*) basic_machine=powerpc-ibm os=-cnk ;; c54x-*) basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c55x-*) basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c6x-*) basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'` ;; c90) basic_machine=c90-cray os=-unicos ;; cegcc) basic_machine=arm-unknown os=-cegcc ;; convex-c1) basic_machine=c1-convex os=-bsd ;; convex-c2) basic_machine=c2-convex os=-bsd ;; convex-c32) basic_machine=c32-convex os=-bsd ;; convex-c34) basic_machine=c34-convex os=-bsd ;; convex-c38) basic_machine=c38-convex os=-bsd ;; cray | j90) basic_machine=j90-cray os=-unicos ;; craynv) basic_machine=craynv-cray os=-unicosmp ;; cr16 | cr16-*) basic_machine=cr16-unknown os=-elf ;; crds | unos) basic_machine=m68k-crds ;; crisv32 | crisv32-* | etraxfs*) basic_machine=crisv32-axis ;; cris | cris-* | etrax*) basic_machine=cris-axis ;; crx) basic_machine=crx-unknown os=-elf ;; da30 | da30-*) basic_machine=m68k-da30 ;; decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) basic_machine=mips-dec ;; decsystem10* | dec10*) basic_machine=pdp10-dec os=-tops10 ;; decsystem20* | dec20*) basic_machine=pdp10-dec os=-tops20 ;; delta | 3300 | motorola-3300 | motorola-delta \ | 3300-motorola | delta-motorola) basic_machine=m68k-motorola ;; delta88) basic_machine=m88k-motorola os=-sysv3 ;; dicos) basic_machine=i686-pc os=-dicos ;; djgpp) basic_machine=i586-pc os=-msdosdjgpp ;; dpx20 | dpx20-*) basic_machine=rs6000-bull os=-bosx ;; dpx2* | dpx2*-bull) basic_machine=m68k-bull os=-sysv3 ;; ebmon29k) basic_machine=a29k-amd os=-ebmon ;; elxsi) basic_machine=elxsi-elxsi os=-bsd ;; encore | umax | mmax) basic_machine=ns32k-encore ;; es1800 | OSE68k | ose68k | ose | OSE) basic_machine=m68k-ericsson os=-ose ;; fx2800) basic_machine=i860-alliant ;; genix) basic_machine=ns32k-ns ;; gmicro) basic_machine=tron-gmicro os=-sysv ;; go32) basic_machine=i386-pc os=-go32 ;; h3050r* | hiux*) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; h8300hms) basic_machine=h8300-hitachi os=-hms ;; h8300xray) basic_machine=h8300-hitachi os=-xray ;; h8500hms) basic_machine=h8500-hitachi os=-hms ;; harris) basic_machine=m88k-harris os=-sysv3 ;; hp300-*) basic_machine=m68k-hp ;; hp300bsd) basic_machine=m68k-hp os=-bsd ;; hp300hpux) basic_machine=m68k-hp os=-hpux ;; hp3k9[0-9][0-9] | hp9[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k2[0-9][0-9] | hp9k31[0-9]) 
basic_machine=m68000-hp ;; hp9k3[2-9][0-9]) basic_machine=m68k-hp ;; hp9k6[0-9][0-9] | hp6[0-9][0-9]) basic_machine=hppa1.0-hp ;; hp9k7[0-79][0-9] | hp7[0-79][0-9]) basic_machine=hppa1.1-hp ;; hp9k78[0-9] | hp78[0-9]) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) # FIXME: really hppa2.0-hp basic_machine=hppa1.1-hp ;; hp9k8[0-9][13679] | hp8[0-9][13679]) basic_machine=hppa1.1-hp ;; hp9k8[0-9][0-9] | hp8[0-9][0-9]) basic_machine=hppa1.0-hp ;; hppa-next) os=-nextstep3 ;; hppaosf) basic_machine=hppa1.1-hp os=-osf ;; hppro) basic_machine=hppa1.1-hp os=-proelf ;; i370-ibm* | ibm*) basic_machine=i370-ibm ;; i*86v32) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv32 ;; i*86v4*) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv4 ;; i*86v) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-sysv ;; i*86sol2) basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` os=-solaris2 ;; i386mach) basic_machine=i386-mach os=-mach ;; i386-vsta | vsta) basic_machine=i386-unknown os=-vsta ;; iris | iris4d) basic_machine=mips-sgi case $os in -irix*) ;; *) os=-irix4 ;; esac ;; isi68 | isi) basic_machine=m68k-isi os=-sysv ;; m68knommu) basic_machine=m68k-unknown os=-linux ;; m68knommu-*) basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; m88k-omron*) basic_machine=m88k-omron ;; magnum | m3230) basic_machine=mips-mips os=-sysv ;; merlin) basic_machine=ns32k-utek os=-sysv ;; microblaze*) basic_machine=microblaze-xilinx ;; mingw64) basic_machine=x86_64-pc os=-mingw64 ;; mingw32) basic_machine=i386-pc os=-mingw32 ;; mingw32ce) basic_machine=arm-unknown os=-mingw32ce ;; miniframe) basic_machine=m68000-convergent ;; *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) basic_machine=m68k-atari os=-mint ;; mips3*-*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` ;; mips3*) basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown ;; monitor) basic_machine=m68k-rom68k os=-coff ;; morphos) basic_machine=powerpc-unknown os=-morphos ;; msdos) basic_machine=i386-pc os=-msdos ;; ms1-*) basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) basic_machine=i386-pc os=-msys ;; mvs) basic_machine=i370-ibm os=-mvs ;; nacl) basic_machine=le32-unknown os=-nacl ;; ncr3000) basic_machine=i486-ncr os=-sysv4 ;; netbsd386) basic_machine=i386-unknown os=-netbsd ;; netwinder) basic_machine=armv4l-rebel os=-linux ;; news | news700 | news800 | news900) basic_machine=m68k-sony os=-newsos ;; news1000) basic_machine=m68030-sony os=-newsos ;; news-3600 | risc-news) basic_machine=mips-sony os=-newsos ;; necv70) basic_machine=v70-nec os=-sysv ;; next | m*-next ) basic_machine=m68k-next case $os in -nextstep* ) ;; -ns2*) os=-nextstep2 ;; *) os=-nextstep3 ;; esac ;; nh3000) basic_machine=m68k-harris os=-cxux ;; nh[45]000) basic_machine=m88k-harris os=-cxux ;; nindy960) basic_machine=i960-intel os=-nindy ;; mon960) basic_machine=i960-intel os=-mon960 ;; nonstopux) basic_machine=mips-compaq os=-nonstopux ;; np1) basic_machine=np1-gould ;; neo-tandem) basic_machine=neo-tandem ;; nse-tandem) basic_machine=nse-tandem ;; nsr-tandem) basic_machine=nsr-tandem ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf ;; openrisc | openrisc-*) basic_machine=or32-unknown ;; os400) basic_machine=powerpc-ibm os=-os400 ;; OSE68000 | ose68000) basic_machine=m68000-ericsson os=-ose ;; os68k) basic_machine=m68k-none os=-os68k ;; pa-hitachi) basic_machine=hppa1.1-hitachi os=-hiuxwe2 ;; paragon) 
basic_machine=i860-intel os=-osf ;; parisc) basic_machine=hppa-unknown os=-linux ;; parisc-*) basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'` os=-linux ;; pbd) basic_machine=sparc-tti ;; pbb) basic_machine=m68k-tti ;; pc532 | pc532-*) basic_machine=ns32k-pc532 ;; pc98) basic_machine=i386-pc ;; pc98-*) basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium | p5 | k5 | k6 | nexgen | viac3) basic_machine=i586-pc ;; pentiumpro | p6 | 6x86 | athlon | athlon_*) basic_machine=i686-pc ;; pentiumii | pentium2 | pentiumiii | pentium3) basic_machine=i686-pc ;; pentium4) basic_machine=i786-pc ;; pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumpro-* | p6-* | 6x86-* | athlon-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pentium4-*) basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` ;; pn) basic_machine=pn-gould ;; power) basic_machine=power-ibm ;; ppc | ppcbe) basic_machine=powerpc-unknown ;; ppc-* | ppcbe-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppcle | powerpclittle | ppc-le | powerpc-little) basic_machine=powerpcle-unknown ;; ppcle-* | powerpclittle-*) basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64) basic_machine=powerpc64-unknown ;; ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ppc64le | powerpc64little | ppc64-le | powerpc64-little) basic_machine=powerpc64le-unknown ;; ppc64le-* | powerpc64little-*) basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` ;; ps2) basic_machine=i386-ibm ;; pw32) basic_machine=i586-unknown os=-pw32 ;; rdos | rdos64) basic_machine=x86_64-pc os=-rdos ;; rdos32) basic_machine=i386-pc os=-rdos ;; rom68k) basic_machine=m68k-rom68k os=-coff ;; rm[46]00) basic_machine=mips-siemens ;; rtpc | rtpc-*) basic_machine=romp-ibm ;; s390 | s390-*) basic_machine=s390-ibm ;; s390x | s390x-*) basic_machine=s390x-ibm ;; sa29200) basic_machine=a29k-amd os=-udi ;; sb1) basic_machine=mipsisa64sb1-unknown ;; sb1el) basic_machine=mipsisa64sb1el-unknown ;; sde) basic_machine=mipsisa32-sde os=-elf ;; sei) basic_machine=mips-sei os=-seiux ;; sequent) basic_machine=i386-sequent ;; sh) basic_machine=sh-hitachi os=-hms ;; sh5el) basic_machine=sh5le-unknown ;; sh64) basic_machine=sh64-unknown ;; sparclite-wrs | simso-wrs) basic_machine=sparclite-wrs os=-vxworks ;; sps7) basic_machine=m68k-bull os=-sysv2 ;; spur) basic_machine=spur-unknown ;; st2000) basic_machine=m68k-tandem ;; stratus) basic_machine=i860-stratus os=-sysv4 ;; strongarm-* | thumb-*) basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'` ;; sun2) basic_machine=m68000-sun ;; sun2os3) basic_machine=m68000-sun os=-sunos3 ;; sun2os4) basic_machine=m68000-sun os=-sunos4 ;; sun3os3) basic_machine=m68k-sun os=-sunos3 ;; sun3os4) basic_machine=m68k-sun os=-sunos4 ;; sun4os3) basic_machine=sparc-sun os=-sunos3 ;; sun4os4) basic_machine=sparc-sun os=-sunos4 ;; sun4sol2) basic_machine=sparc-sun os=-solaris2 ;; sun3 | sun3-*) basic_machine=m68k-sun ;; sun4) basic_machine=sparc-sun ;; sun386 | sun386i | roadrunner) basic_machine=i386-sun ;; sv1) basic_machine=sv1-cray os=-unicos ;; symmetry) basic_machine=i386-sequent os=-dynix ;; t3e) basic_machine=alphaev5-cray os=-unicos ;; t90) basic_machine=t90-cray os=-unicos ;; tile*) basic_machine=$basic_machine-unknown os=-linux-gnu ;; tx39) 
basic_machine=mipstx39-unknown ;; tx39el) basic_machine=mipstx39el-unknown ;; toad1) basic_machine=pdp10-xkl os=-tops20 ;; tower | tower-32) basic_machine=m68k-ncr ;; tpf) basic_machine=s390x-ibm os=-tpf ;; udi29k) basic_machine=a29k-amd os=-udi ;; ultra3) basic_machine=a29k-nyu os=-sym1 ;; v810 | necv810) basic_machine=v810-nec os=-none ;; vaxv) basic_machine=vax-dec os=-sysv ;; vms) basic_machine=vax-dec os=-vms ;; vpp*|vx|vx-*) basic_machine=f301-fujitsu ;; vxworks960) basic_machine=i960-wrs os=-vxworks ;; vxworks68) basic_machine=m68k-wrs os=-vxworks ;; vxworks29k) basic_machine=a29k-wrs os=-vxworks ;; w65*) basic_machine=w65-wdc os=-none ;; w89k-*) basic_machine=hppa1.1-winbond os=-proelf ;; xbox) basic_machine=i686-pc os=-mingw32 ;; xps | xps100) basic_machine=xps100-honeywell ;; xscale-* | xscalee[bl]-*) basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'` ;; ymp) basic_machine=ymp-cray os=-unicos ;; z8k-*-coff) basic_machine=z8k-unknown os=-sim ;; z80-*-coff) basic_machine=z80-unknown os=-sim ;; none) basic_machine=none-none os=-none ;; # Here we handle the default manufacturer of certain CPU types. It is in # some cases the only manufacturer, in others, it is the most popular. w89k) basic_machine=hppa1.1-winbond ;; op50n) basic_machine=hppa1.1-oki ;; op60c) basic_machine=hppa1.1-oki ;; romp) basic_machine=romp-ibm ;; mmix) basic_machine=mmix-knuth ;; rs6000) basic_machine=rs6000-ibm ;; vax) basic_machine=vax-dec ;; pdp10) # there are many clones, so DEC is not a safe bet basic_machine=pdp10-unknown ;; pdp11) basic_machine=pdp11-dec ;; we32k) basic_machine=we32k-att ;; sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele) basic_machine=sh-unknown ;; sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) basic_machine=sparc-sun ;; cydra) basic_machine=cydra-cydrome ;; orion) basic_machine=orion-highlevel ;; orion105) basic_machine=clipper-highlevel ;; mac | mpw | mac-mpw) basic_machine=m68k-apple ;; pmac | pmac-mpw) basic_machine=powerpc-apple ;; *-unknown) # Make sure to match an already-canonicalized machine name. ;; *) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 exit 1 ;; esac # Here we canonicalize certain aliases for manufacturers. case $basic_machine in *-digital*) basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` ;; *-commodore*) basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` ;; *) ;; esac # Decode manufacturer-specific aliases for certain operating systems. if [ x"$os" != x"" ] then case $os in # First match some system type aliases # that might get confused with valid system types. # -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux ;; -solaris1 | -solaris1.*) os=`echo $os | sed -e 's|solaris1|sunos4|'` ;; -solaris) os=-solaris2 ;; -svr4*) os=-sysv4 ;; -unixware*) os=-sysv4.2uw ;; -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; # First accept the basic system types. # The portable systems comes first. # Each alternative MUST END IN A *, to match a version number. # -sysv* is not here because it comes later, after sysvr4. 
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ | -sym* | -kopensolaris* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* | -aros* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ | -bitrig* | -openbsd* | -solidbsd* \ | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ | -chorusos* | -chorusrdb* | -cegcc* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ | -uxpv* | -beos* | -mpeix* | -udk* \ | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) case $basic_machine in x86-* | i*86-*) ;; *) os=-nto$os ;; esac ;; -nto-qnx*) ;; -nto*) os=`echo $os | sed -e 's|nto|nto-qnx|'` ;; -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) ;; -mac*) os=`echo $os | sed -e 's|mac|macos|'` ;; -linux-dietlibc) os=-linux-dietlibc ;; -linux*) os=`echo $os | sed -e 's|linux|linux-gnu|'` ;; -sunos5*) os=`echo $os | sed -e 's|sunos5|solaris2|'` ;; -sunos6*) os=`echo $os | sed -e 's|sunos6|solaris3|'` ;; -opened*) os=-openedition ;; -os400*) os=-os400 ;; -wince*) os=-wince ;; -osfrose*) os=-osfrose ;; -osf*) os=-osf ;; -utek*) os=-bsd ;; -dynix*) os=-bsd ;; -acis*) os=-aos ;; -atheos*) os=-atheos ;; -syllable*) os=-syllable ;; -386bsd) os=-bsd ;; -ctix* | -uts*) os=-sysv ;; -nova*) os=-rtmk-nova ;; -ns2 ) os=-nextstep2 ;; -nsk*) os=-nsk ;; # Preserve the version number of sinix5. -sinix5.*) os=`echo $os | sed -e 's|sinix|sysv|'` ;; -sinix*) os=-sysv4 ;; -tpf*) os=-tpf ;; -triton*) os=-sysv3 ;; -oss*) os=-sysv3 ;; -svr4) os=-sysv4 ;; -svr3) os=-sysv3 ;; -sysvr4) os=-sysv4 ;; # This must come after -sysvr4. -sysv*) ;; -ose*) os=-ose ;; -es1800*) os=-ose ;; -xenix) os=-xenix ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) os=-mint ;; -aros*) os=-aros ;; -kaos*) os=-kaos ;; -zvmoe) os=-zvmoe ;; -dicos*) os=-dicos ;; -nacl*) ;; -none) ;; *) # Get rid of the `-' at the beginning of $os. os=`echo $os | sed 's/[^-]*-//'` echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 exit 1 ;; esac else # Here we handle the default operating systems that come with various machines. # The value should be what the vendor currently ships out the door with their # machine or put another way, the most popular os provided with the machine. # Note that if you're going to try to match "-MANUFACTURER" here (say, # "-sun"), then you have to tell the case statement up towards the top # that MANUFACTURER isn't an operating system. 
Otherwise, code above # will signal an error saying that MANUFACTURER isn't an operating # system, and we'll never get to this point. case $basic_machine in score-*) os=-elf ;; spu-*) os=-elf ;; *-acorn) os=-riscix1.2 ;; arm*-rebel) os=-linux ;; arm*-semi) os=-aout ;; c4x-* | tic4x-*) os=-coff ;; hexagon-*) os=-elf ;; tic54x-*) os=-coff ;; tic55x-*) os=-coff ;; tic6x-*) os=-coff ;; # This must come before the *-dec entry. pdp10-*) os=-tops20 ;; pdp11-*) os=-none ;; *-dec | vax-*) os=-ultrix4.2 ;; m68*-apollo) os=-domain ;; i386-sun) os=-sunos4.0.2 ;; m68000-sun) os=-sunos3 ;; m68*-cisco) os=-aout ;; mep-*) os=-elf ;; mips*-cisco) os=-elf ;; mips*-*) os=-elf ;; or32-*) os=-coff ;; *-tti) # must be before sparc entry or we get the wrong os. os=-sysv3 ;; sparc-* | *-sun) os=-sunos4.1.1 ;; *-be) os=-beos ;; *-haiku) os=-haiku ;; *-ibm) os=-aix ;; *-knuth) os=-mmixware ;; *-wec) os=-proelf ;; *-winbond) os=-proelf ;; *-oki) os=-proelf ;; *-hp) os=-hpux ;; *-hitachi) os=-hiux ;; i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) os=-sysv ;; *-cbm) os=-amigaos ;; *-dg) os=-dgux ;; *-dolphin) os=-sysv3 ;; m68k-ccur) os=-rtu ;; m88k-omron*) os=-luna ;; *-next ) os=-nextstep ;; *-sequent) os=-ptx ;; *-crds) os=-unos ;; *-ns) os=-genix ;; i370-*) os=-mvs ;; *-next) os=-nextstep3 ;; *-gould) os=-sysv ;; *-highlevel) os=-bsd ;; *-encore) os=-bsd ;; *-sgi) os=-irix ;; *-siemens) os=-sysv4 ;; *-masscomp) os=-rtu ;; f30[01]-fujitsu | f700-fujitsu) os=-uxpv ;; *-rom68k) os=-coff ;; *-*bug) os=-coff ;; *-apple) os=-macos ;; *-atari*) os=-mint ;; *) os=-none ;; esac fi # Here we handle the case where we know the os, and the CPU type, but not the # manufacturer. We pick the logical manufacturer. vendor=unknown case $basic_machine in *-unknown) case $os in -riscix*) vendor=acorn ;; -sunos*) vendor=sun ;; -cnk*|-aix*) vendor=ibm ;; -beos*) vendor=be ;; -hpux*) vendor=hp ;; -mpeix*) vendor=hp ;; -hiux*) vendor=hitachi ;; -unos*) vendor=crds ;; -dgux*) vendor=dg ;; -luna*) vendor=omron ;; -genix*) vendor=ns ;; -mvs* | -opened*) vendor=ibm ;; -os400*) vendor=ibm ;; -ptx*) vendor=sequent ;; -tpf*) vendor=ibm ;; -vxsim* | -vxworks* | -windiss*) vendor=wrs ;; -aux*) vendor=apple ;; -hms*) vendor=hitachi ;; -mpw* | -macos*) vendor=apple ;; -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) vendor=atari ;; -vos*) vendor=stratus ;; esac basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` ;; esac echo $basic_machine$os exit # Local variables: # eval: (add-hook 'write-file-hooks 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" # End: x264-master/configure000077500000000000000000001576721502133446700150100ustar00rootroot00000000000000#!/bin/bash if test x"$1" = x"-h" -o x"$1" = x"--help" ; then cat <> config.log } log_ok() { echo "yes" >> config.log } log_fail() { echo "no" >> config.log } log_msg() { echo "$1" >> config.log } cc_cflags() { # several non gcc compilers issue an incredibly large number of warnings on high warning levels, # suppress them by reducing the warning level rather than having to use #pragmas for arg in $*; do [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= [ "$arg" = -Wno-maybe-uninitialized ] && arg= [[ "$arg" = -mpreferred-stack-boundary* ]] && arg= [[ "$arg" = -l* ]] && arg= [[ "$arg" = -L* ]] && arg= if [ $compiler_style = MS ]; then [ "$arg" = -ffast-math ] && arg="-fp:fast" [ "$arg" = -Wall ] && arg= [ "$arg" = -Werror ] && arg="-W3 -WX" [ "$arg" = -g ] && 
arg=-Z7 [ "$arg" = -fomit-frame-pointer ] && arg= [ "$arg" = -s ] && arg= [ "$arg" = -fPIC ] && arg= else [ "$arg" = -ffast-math ] && arg= [ "$arg" = -Wall ] && arg= [ "$arg" = -Werror ] && arg="-w3 -Werror" fi [ $compiler = CL -a "$arg" = -O3 ] && arg=-O2 [ -n "$arg" ] && echo -n "$arg " done } cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L} [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware [ "$arg" = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Werror ] && arg= [ "$arg" = -Wshadow ] && arg= [ "$arg" = -Wmaybe-uninitialized ] && arg= [[ "$arg" = -Qdiag-error* ]] && arg= arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib [ "$arg" = gpac.lib ] && arg=libgpac.lib [ "$arg" = x264.lib ] && arg=libx264.lib [ -n "$arg" ] && echo -n "$arg " done } cc_check() { if [ -z "$3$4" ]; then if [ -z "$1$2" ]; then log_check "whether $CC works" elif [ -z "$1" ]; then log_check "for $2" else log_check "for $1" fi elif [ -z "$1" ]; then if [ -z "$2" ]; then if [ -z "$3" ]; then log_check "whether $CC supports $4" else log_check "whether $CC supports $3" fi else log_check "whether $CC supports $3 with $2" fi else log_check "for $3 in $1"; fi rm -f conftest.c for arg in $1; do echo "#include <$arg>" >> conftest.c done if [ -n "$4" ]; then echo "$4" >> conftest.c fi echo "int main (void) { $3 return 0; }" >> conftest.c if [ $compiler_style = MS ]; then cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CFLAGSCLI $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" else cc_cmd="$CC conftest.c $CFLAGS $CFLAGSCLI $CHECK_CFLAGS $2 $LDFLAGSCLI $LDFLAGS -o conftest" fi if $cc_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$cc_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.c >> config.log log_msg "--------------------------------------------------" fi return $res } cpp_check() { log_check "whether $3 is true" rm -f conftest.c for arg in $1; do echo "#include <$arg>" >> conftest.c done echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c if [ $compiler_style = MS ]; then cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" else cpp_cmd="$CC conftest.c $CFLAGS $2 -E -o conftest" fi if $cpp_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "--------------------------------------------------" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.c >> config.log log_msg "--------------------------------------------------" fi return $res } as_check() { log_check "whether $AS supports $1" echo "$1" > conftest$AS_EXT as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o" if $as_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? 
log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$as_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest$AS_EXT >> config.log log_msg "--------------------------------------------------" fi return $res } as_archext_check() { feature="$1" instr="$2" feature_upper="$(echo $feature | tr a-z A-Z)" header=".arch $as_arch_level ${NL}" if as_check "$header .arch_extension $feature" ; then define HAVE_AS_ARCHEXT_${feature_upper}_DIRECTIVE header="$header .arch_extension $feature ${NL}" fi as_check "$header $instr" && define HAVE_${feature_upper} } rc_check() { log_check "whether $RC works" echo "$1" > conftest.rc if [ $compiler = GNU ]; then rc_cmd="$RC $RCFLAGS -o conftest.o conftest.rc" else rc_cmd="$RC $RCFLAGS -foconftest.o conftest.rc" fi if $rc_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$rc_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" log_msg "Failed program was:" log_msg "--------------------------------------------------" cat conftest.rc >> config.log log_msg "--------------------------------------------------" fi return $res } pkg_check() { log_check "for packages: $1" pkg_cmd="$PKGCONFIG --exists $1" if $pkg_cmd >conftest.log 2>&1; then res=$? log_ok else res=$? log_fail log_msg "Failed commandline was:" log_msg "--------------------------------------------------" log_msg "$pkg_cmd" cat conftest.log >> config.log log_msg "--------------------------------------------------" fi return $res } define() { echo "#define $1$([ -n "$2" ] && echo " $2" || echo " 1")" >> config.h } die() { log_msg "DIED: $@" echo "$@" exit 1 } configure_system_override() { log_check "system libx264 configuration" x264_config_path="$1/x264_config.h" if [ -e "$x264_config_path" ]; then res=$? log_ok arg="$(grep '#define X264_GPL ' $x264_config_path | sed -e 's/#define X264_GPL *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="no" || arg="yes" [ "$arg" != "$gpl" ] && die "Incompatible license with system libx264" fi arg="$(grep '#define X264_BIT_DEPTH ' $x264_config_path | sed -e 's/#define X264_BIT_DEPTH *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="all" if [ "$arg" != "$bit_depth" ]; then echo "Override output bit depth with system libx264 configuration" bit_depth="$arg" fi fi arg="$(grep '#define X264_CHROMA_FORMAT ' $x264_config_path | sed -e 's/#define X264_CHROMA_FORMAT *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="all" || arg="${arg#X264_CSP_I}" if [ "$arg" != "$chroma_format" ]; then echo "Override output chroma format with system libx264 configuration" chroma_format="$arg" fi fi arg="$(grep '#define X264_INTERLACED ' $x264_config_path | sed -e 's/#define X264_INTERLACED *//; s/ *$//')" if [ -n "$arg" ]; then [ "$arg" = 0 ] && arg="no" || arg="yes" if [ "$arg" != "$interlaced" ]; then echo "Override interlaced encoding support with system libx264 configuration" interlaced="$arg" fi fi else res=$? 
log_fail log_msg "Failed search path was: $x264_config_path" fi return $res } rm -f x264_config.h config.h config.mak config.log x264.pc x264.def rm -rf conftest* # Construct a path to the specified directory relative to the working directory relative_path() { local base="${PWD%/}" local path="$(cd "$1" >/dev/null; printf '%s/.' "${PWD%/}")" local up='' while [[ $path != "$base/"* ]]; do base="${base%/*}" up="../$up" done dirname "$up${path#"$base/"}" } SRCPATH="$(relative_path "$(dirname "$0")")" echo "$SRCPATH" | grep -q ' ' && die "Out of tree builds are impossible with whitespace in source path." [ -e "$SRCPATH/config.h" -o -e "$SRCPATH/x264_config.h" ] && die "Out of tree builds are impossible with config.h/x264_config.h in source dir." prefix='/usr/local' exec_prefix='${prefix}' bindir='${exec_prefix}/bin' libdir='${exec_prefix}/lib' includedir='${prefix}/include' DEVNULL='/dev/null' cli="yes" cli_libx264="internal" shared="no" static="no" bashcompletion="auto" bashcompletionsdir="" avs="auto" lavf="auto" ffms="auto" gpac="auto" lsmash="auto" mp4="no" gpl="yes" thread="auto" swscale="auto" asm="auto" interlaced="yes" lto="no" debug="no" gprof="no" strip="no" pic="no" bit_depth="all" chroma_format="all" compiler="GNU" compiler_style="GNU" opencl="yes" vsx="auto" CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)" CFLAGSSO="$CFLAGSSO" CFLAGSCLI="$CFLAGSCLI" LDFLAGS="$LDFLAGS" LDFLAGSCLI="$LDFLAGSCLI" ASFLAGS="$ASFLAGS -I. -I\$(SRCPATH)" RCFLAGS="$RCFLAGS" CHECK_CFLAGS="" HAVE_GETOPT_LONG=1 cross_prefix="" EXE="" AS_EXT=".S" NL=" " # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ MSA LSX MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10 ELF_AUX_INFO GETAUXVAL \ SYSCONF SYNC_FETCH_AND_ADD \ DOTPROD I8MM SVE SVE2 \ AS_ARCHEXT_DOTPROD_DIRECTIVE AS_ARCHEXT_I8MM_DIRECTIVE AS_ARCHEXT_SVE_DIRECTIVE AS_ARCHEXT_SVE2_DIRECTIVE" # parse options for opt do optarg="${opt#*=}" case "$opt" in --prefix=*) prefix="$optarg" ;; --exec-prefix=*) exec_prefix="$optarg" ;; --bindir=*) bindir="$optarg" ;; --libdir=*) libdir="$optarg" ;; --includedir=*) includedir="$optarg" ;; --disable-cli) cli="no" ;; --system-libx264) cli_libx264="system" ;; --enable-shared) shared="yes" ;; --enable-static) static="yes" ;; --disable-bashcompletion) bashcompletion="no" ;; --enable-bashcompletion) bashcompletion="yes" ;; --bashcompletionsdir=*) bashcompletionsdir="$optarg" ;; --disable-asm) asm="no" ;; --disable-interlaced) interlaced="no" ;; --disable-avs) avs="no" ;; --disable-lavf) lavf="no" ;; --disable-ffms) ffms="no" ;; --disable-gpac) gpac="no" ;; --disable-lsmash) lsmash="no" ;; --disable-gpl) gpl="no" ;; --extra-asflags=*) ASFLAGS="$ASFLAGS $optarg" ;; --extra-cflags=*) CFLAGS="$CFLAGS $optarg" ;; --extra-ldflags=*) LDFLAGS="$LDFLAGS $optarg" ;; --extra-rcflags=*) RCFLAGS="$RCFLAGS $optarg" ;; --disable-thread) thread="no" ;; --disable-win32thread) [ "$thread" != "no" ] && thread="posix" ;; --disable-swscale) swscale="no" ;; --enable-lto) lto="auto" ;; --enable-debug) debug="yes" ;; --enable-gprof) CFLAGS="$CFLAGS -pg" LDFLAGS="$LDFLAGS -pg" gprof="yes" ;; --enable-strip) strip="yes" ;; --enable-pic) pic="yes" ;; --host=*) host="$optarg" ;; --disable-vsx) vsx="no" ;; --disable-opencl) opencl="no" ;; --cross-prefix=*) cross_prefix="$optarg" ;; --sysroot=*) 
CFLAGS="$CFLAGS --sysroot=$optarg" LDFLAGS="$LDFLAGS --sysroot=$optarg" ;; --bit-depth=*) bit_depth="$optarg" if [ "$bit_depth" != "8" -a "$bit_depth" != "10" -a "$bit_depth" != "all" ]; then echo "Supplied bit depth must be 8, 10 or all." exit 1 fi ;; --chroma-format=*) chroma_format="$optarg" if [ $chroma_format != "400" -a $chroma_format != "420" -a $chroma_format != "422" -a $chroma_format != "444" -a $chroma_format != "all" ]; then echo "Supplied chroma format must be 400, 420, 422, 444 or all." exit 1 fi ;; *) echo "Unknown option $opt, ignored" ;; esac done [ "$cli" = "no" -a "$shared" = "no" -a "$static" = "no" ] && die "Nothing to build. Enable cli, shared or static." CC="${CC-${cross_prefix}gcc}" STRIP="${STRIP-${cross_prefix}strip}" STRINGS="${STRINGS-${cross_prefix}strings}" INSTALL="${INSTALL-install}" PKGCONFIG="${PKGCONFIG-${cross_prefix}pkg-config}" # ar and ranlib doesn't load the LTO plugin by default, prefer the gcc-prefixed wrappers which does. if ${cross_prefix}gcc-ar --version >/dev/null 2>&1; then AR="${AR-${cross_prefix}gcc-ar}" else AR="${AR-${cross_prefix}ar}" fi if ${cross_prefix}gcc-ranlib --version >/dev/null 2>&1; then RANLIB="${RANLIB-${cross_prefix}gcc-ranlib}" else RANLIB="${RANLIB-${cross_prefix}ranlib}" fi if [ "x$host" = x ]; then host="$(${SRCPATH}/config.guess)" fi # normalize a triplet into a quadruplet host="$(${SRCPATH}/config.sub $host)" # split $host host_cpu="${host%%-*}" host="${host#*-}" host_vendor="${host%%-*}" host_os="${host#*-}" trap 'rm -rf conftest*' EXIT # test for use of compilers that require specific handling cc_base="$(basename "$CC")" QPRE="-" if [[ $host_os = mingw* || $host_os = msys* || $host_os = cygwin* ]]; then if [[ "$cc_base" = icl || "$cc_base" = icl[\ .]* ]]; then # Windows Intel Compiler creates dependency generation with absolute Windows paths, Cygwin's make does not support Windows paths. [[ $host_os = cygwin* ]] && die "Windows Intel Compiler support requires MSYS" compiler=ICL compiler_style=MS CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras" QPRE="-Q" cpp_check '' '' '_MSC_VER >= 1400' || die "Windows Intel Compiler support requires Visual Studio 2005 or newer" if cpp_check '' '' 'defined(_M_AMD64) || defined(_M_X64)' ; then host_cpu=x86_64 elif cpp_check '' '' 'defined(_M_IX86)' ; then host_cpu=i486 fi if cc_check '' -Qdiag-error:10006,10157 ; then CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157" fi elif [[ "$cc_base" = cl || "$cc_base" = cl[\ .]* ]]; then # Standard Microsoft Visual Studio compiler=CL compiler_style=MS CFLAGS="$CFLAGS -nologo -GS- -DHAVE_STRING_H -I\$(SRCPATH)/extras" cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die "Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer" if cpp_check '' '' 'defined(_M_AMD64) || defined(_M_X64)' ; then host_cpu=x86_64 elif cpp_check '' '' 'defined(_M_IX86)' ; then host_cpu=i486 elif cpp_check '' '' 'defined(_M_ARM64)' ; then host_cpu=aarch64 elif cpp_check '' '' 'defined(_M_ARM)' ; then host_cpu=arm fi else # MinGW uses broken pre-VS2015 Microsoft printf functions unless it's told to use the POSIX ones. 
CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L" fi else if [[ "$cc_base" = icc || "$cc_base" = icc[\ .]* ]]; then AR="xiar" compiler=ICC fi fi if [ $compiler = GNU ]; then if cc_check '' -Werror=unknown-warning-option ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option" fi if cc_check '' -Werror=unknown-attributes ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-attributes" fi if cc_check '' -Werror=attributes ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=attributes" fi if cc_check '' -Werror=ignored-attributes ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=ignored-attributes" fi if cc_check '' -Werror=implicit-function-declaration ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=implicit-function-declaration" fi fi libm="" case $host_os in beos*) SYS="BEOS" define HAVE_MALLOC_H ;; darwin*) SYS="MACOSX" libm="-lm" if [ "$pic" = "no" ]; then cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic" fi ;; dragonfly*) SYS="DRAGONFLY" libm="-lm" ;; freebsd*) SYS="FREEBSD" libm="-lm" ;; kfreebsd*-gnu) SYS="FREEBSD" define HAVE_MALLOC_H libm="-lm" ;; netbsd*) SYS="NETBSD" libm="-lm" ;; openbsd*) SYS="OPENBSD" libm="-lm" ;; *linux*) SYS="LINUX" define HAVE_MALLOC_H libm="-lm" ;; gnu*) SYS="HURD" define HAVE_MALLOC_H libm="-lm" ;; cygwin*|mingw*|msys*) EXE=".exe" if [[ $host_os = cygwin* ]] && cpp_check "" "" "defined(__CYGWIN__)" ; then SYS="CYGWIN" define HAVE_MALLOC_H else SYS="WINDOWS" DEVNULL="NUL" cc_check '' -lshell32 && LDFLAGSCLI="$LDFLAGSCLI -lshell32" [ $compiler = GNU ] && RC="${RC-${cross_prefix}windres}" || RC="${RC-rc.exe}" fi ;; sunos*|solaris*) SYS="SunOS" define HAVE_MALLOC_H libm="-lm" if cc_check "" /usr/lib/64/values-xpg6.o; then LDFLAGS="$LDFLAGS /usr/lib/64/values-xpg6.o" else LDFLAGS="$LDFLAGS /usr/lib/values-xpg6.o" fi if test -x /usr/ucb/install ; then INSTALL=/usr/ucb/install elif test -x /usr/bin/ginstall ; then # OpenSolaris INSTALL=/usr/bin/ginstall elif test -x /usr/gnu/bin/install ; then # OpenSolaris INSTALL=/usr/gnu/bin/install fi HAVE_GETOPT_LONG=0 ;; *qnx*) SYS="QNX" define HAVE_MALLOC_H libm="-lm" HAVE_GETOPT_LONG=0 CFLAGS="$CFLAGS -I\$(SRCPATH)/extras" ;; *haiku*) SYS="HAIKU" ;; *) die "Unknown system $host, edit the configure" ;; esac LDFLAGS="$LDFLAGS $libm" stack_alignment=4 case $host_cpu in i*86) ARCH="X86" AS="${AS-nasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" fi if [[ "$asm" == auto && "$CFLAGS" != *-mfpmath* ]]; then CFLAGS="$CFLAGS -mfpmath=sse -msse -msse2" fi CFLAGS="-m32 $CFLAGS" LDFLAGS="-m32 $LDFLAGS" fi if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho32 -DPREFIX" elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else ASFLAGS="$ASFLAGS -f elf32" fi ;; x86_64) ARCH="X86_64" AS="${AS-nasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" stack_alignment=16 [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho64 -DPREFIX" if cc_check '' "-arch x86_64"; then CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" fi elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win64" if [ $compiler = GNU ]; then # only the GNU toolchain is inconsistent in 
prefixing function names with _ cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" cc_check "" "-Wl,--high-entropy-va" && LDFLAGS="$LDFLAGS -Wl,--high-entropy-va" LDFLAGS="$LDFLAGS -Wl,--dynamicbase,--nxcompat,--tsaware" LDFLAGSCLI="$LDFLAGSCLI -Wl,--image-base,0x140000000" SOFLAGS="$SOFLAGS -Wl,--image-base,0x180000000" RCFLAGS="--target=pe-x86-64 $RCFLAGS" fi else ASFLAGS="$ASFLAGS -f elf64" fi ;; powerpc*) ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC AS="${AS-${CC}}" AS_EXT=".c" if [ $SYS = MACOSX ] ; then CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4" else CFLAGS="$CFLAGS -maltivec -mabi=altivec" define HAVE_ALTIVEC_H fi if [ "$vsx" != "no" ] ; then vsx="no" if cc_check "" "-mvsx" ; then CFLAGS="$CFLAGS -mvsx" define HAVE_VSX vsx="yes" fi fi fi ;; sparc) ARCH="SPARC" ;; mips*) ARCH="MIPS" AS="${AS-${CC}}" AS_EXT=".c" ;; loongarch*) ARCH="LOONGARCH" ASFLAGS="$ASFLAGS -c" AS="${AS-${CC}}" AS_EXT=".S" ;; aarch64|arm64*) ARCH="AARCH64" stack_alignment=16 if [ "$SYS" = MACOSX ] ; then AS="${AS-${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" if cc_check '' "-arch arm64"; then CFLAGS="$CFLAGS -arch arm64" LDFLAGS="$LDFLAGS -arch arm64" ASFLAGS="$ASFLAGS -arch arm64" fi elif [ "$SYS" = WINDOWS ] && [ "$compiler" = CL ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch aarch64 -as-type armasm -- armasm64 -nologo}" else AS="${AS-${CC}}" fi ;; arm*) ARCH="ARM" if [ "$SYS" = MACOSX ] ; then AS="${AS-${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all # build for armv7 by default if ! echo $CFLAGS | grep -Eq '\-arch' ; then CFLAGS="$CFLAGS -arch armv7" LDFLAGS="$LDFLAGS -arch armv7" fi elif [ "$SYS" = WINDOWS ] && [ "$compiler" = CL ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -as-type armasm -force-thumb -- armasm -nologo -ignore 4509}" elif [ "$SYS" = WINDOWS ] ; then AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -as-type clang -force-thumb -- ${CC} -mimplicit-it=always}" else AS="${AS-${CC}}" fi ;; s390|s390x) ARCH="S390" ;; hppa*|parisc*) ARCH="PARISC" ;; ia64) ARCH="IA64" ;; alpha*) ARCH="ALPHA" ;; *) ARCH="$(echo $host_cpu | tr a-z A-Z)" ;; esac [ "$vsx" != "yes" ] && vsx="no" if [ $SYS = WINDOWS ]; then if ! rc_check "0 RCDATA {0}" ; then RC="" fi if cpp_check "winapifamily.h" "" "!WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)" ; then [ $compiler = CL ] || die "WinRT requires MSVC" define HAVE_WINRT CFLAGS="$CFLAGS -MD" LDFLAGS="$LDFLAGS -appcontainer" if ! cpp_check "" "" "defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0603" ; then die "_WIN32_WINNT must be defined to at least 0x0603 (Windows 8.1) for WinRT" elif cpp_check "" "" "_WIN32_WINNT >= 0x0A00" ; then # Universal Windows Platform (Windows 10) LDFLAGS="$LDFLAGS -lWindowsApp" fi cli="no" opencl="no" fi fi log_msg "x264 configure script" if [ -n "$*" ]; then msg="Command line options:" for i in $@; do msg="$msg \"$i\"" done log_msg "$msg" fi log_msg "" # check requirements cc_check || die "No working C compiler found." if [ $compiler_style = GNU ]; then if cc_check '' -std=gnu99 'for( int i = 0; i < 9; i++ );' ; then CFLAGS="$CFLAGS -std=gnu99 -D_GNU_SOURCE" elif cc_check '' -std=c99 'for( int i = 0; i < 9; i++ );' ; then CFLAGS="$CFLAGS -std=c99 -D_POSIX_C_SOURCE=200112L -D_BSD_SOURCE" elif ! cc_check '' '' 'for( int i = 0; i < 9; i++ );' ; then die "C99 compiler is needed for compilation." 
fi fi if [ $shared = yes ] ; then pic="yes" fi if cc_check '' '' '' '__attribute__((force_align_arg_pointer))' ; then if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if cc_check '' -mpreferred-stack-boundary=6 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=6" stack_alignment=64 elif cc_check '' -mstack-alignment=64 ; then CFLAGS="$CFLAGS -mstack-alignment=64" stack_alignment=64 elif [ $stack_alignment -lt 16 ] ; then if cc_check '' -mpreferred-stack-boundary=4 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=4" stack_alignment=16 elif cc_check '' -mstack-alignment=16 ; then CFLAGS="$CFLAGS -mstack-alignment=16" stack_alignment=16 fi fi elif [ $compiler = ICC -a $ARCH = X86 ]; then # icc on linux has various degrees of mod16 stack support if [ $SYS = LINUX ]; then # >= 12 defaults to a mod16 stack if cpp_check "" "" "__INTEL_COMPILER >= 1200" ; then stack_alignment=16 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so. elif cpp_check "" "" "__INTEL_COMPILER >= 1100" ; then CFLAGS="$CFLAGS -falign-stack=assume-16-byte" stack_alignment=16 fi # < 11 is completely incapable of keeping a mod16 stack fi fi fi if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then if ! as_check "vmovdqa32 [eax]{k1}{z}, zmm0" ; then VER="$( ($AS --version || echo no assembler) 2>/dev/null | head -n 1 )" echo "Found $VER" echo "Minimum version is nasm-2.13" echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM define HAVE_MMX fi if [ $asm = auto -a $ARCH = ARM ] ; then # set flags so neon is built by default [ $compiler == CL ] || echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu)' || CFLAGS="$CFLAGS -mcpu=cortex-a8 -mfpu=neon" cc_check '' '' '__asm__("add r0, r1, r2");' && define HAVE_ARM_INLINE_ASM if [ $compiler = CL ] && cpp_check '' '' 'defined(_M_ARM) && _M_ARM >= 7' ; then define HAVE_ARMV6 define HAVE_ARMV6T2 define HAVE_NEON elif cc_check '' '' '__asm__("rev ip, ip");' ; then define HAVE_ARMV6 cc_check '' '' '__asm__("movt r0, #0");' && define HAVE_ARMV6T2 cc_check '' '' '__asm__("vadd.i16 q0, q0, q0");' && define HAVE_NEON ASFLAGS="$ASFLAGS -c" else echo "You specified a pre-ARMv6 or Thumb-1 CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi if [ $asm = auto -a $ARCH = AARCH64 ] ; then if [ $compiler != CL ] ; then ASFLAGS="$ASFLAGS -c" fi if as_check "cmeq v0.8h, v0.8h, #0" ; then define HAVE_AARCH64 define HAVE_NEON else echo "no NEON support, try adding -mfpu=neon to CFLAGS" echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi # Check for higher .arch levels. We only need armv8.2-a in order to # enable the extensions we want below - we primarily want to control # them via .arch_extension. However: # # Clang before version 17 (Xcode versions before 16) didn't support # controlling the dotprod/i8mm extensions via .arch_extension; thus # try to enable them via the .arch level as well. as_arch_level="armv8-a" for level in armv8.2-a armv8.4-a armv8.6-a; do as_check ".arch ${level}" && as_arch_level="$level" done # Clang before version 17 (Xcode versions before 16) also had a bug # (https://github.com/llvm/llvm-project/issues/32220) causing a plain # ".arch " to not have any effect unless it had an extra # "+" included - but it was activated on the next # ".arch_extension" directive. 
Check if we can include "+crc" as dummy # feature to make the .arch directive behave as expected and take # effect right away. as_check ".arch ${as_arch_level}+crc" && as_arch_level="${as_arch_level}+crc" define AS_ARCH_LEVEL "$as_arch_level" as_archext_check dotprod "udot v0.4s, v0.16b, v0.16b" as_archext_check i8mm "usdot v0.4s, v0.16b, v0.16b" as_archext_check sve "ptrue p0.b, vl16" as_archext_check sve2 "smlalb z10.s, z2.h, z1.h" fi if [ $asm = auto -a \( $ARCH = ARM -o $ARCH = AARCH64 \) ] ; then # check if the assembler supports '.func' (clang 3.5 does not) as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1 fi if [ $asm = auto -a $ARCH = MIPS ] ; then if ! echo $CFLAGS | grep -Eq '(-march|-mmsa|-mno-msa)' ; then cc_check '' '-mmsa -mfp64 -mhard-float' && CFLAGS="-mmsa -mfp64 -mhard-float $CFLAGS" fi if cc_check '' '' '__asm__("addvi.b $w0, $w1, 1");' ; then define HAVE_MSA else echo "You specified a pre-MSA CPU in your CFLAGS." echo "If you really want to run on such a CPU, configure with --disable-asm." exit 1 fi fi if [ $asm = auto -a $ARCH = LOONGARCH ] ; then if cc_check '' '' '__asm__("xvadd.b $xr0, $xr1, $xr2");' ; then # Use HAVE_LSX as the base flag, compiler support LA SIMD(LSX and LASX) define HAVE_LSX fi fi [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" define ARCH_$ARCH define SYS_$SYS define STACK_ALIGNMENT $stack_alignment ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" # skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. each have flags that will cause the check to fail as well CPU_ENDIAN="little-endian" if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c $CC $CFLAGS -fno-lto conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed" if (${STRINGS} -a conftest.o | grep -q BIGE) && (${STRINGS} -a conftest.o | grep -q FPendian) ; then define WORDS_BIGENDIAN CPU_ENDIAN="big-endian" elif !(${STRINGS} -a conftest.o | grep -q EGIB && ${STRINGS} -a conftest.o | grep -q naidnePF) ; then die "endian test failed" fi fi if [ "$cli_libx264" = "system" -a "$shared" != "yes" ] ; then [ "$static" = "yes" ] && die "Option --system-libx264 can not be used together with --enable-static" if pkg_check x264 ; then X264_LIBS="$($PKGCONFIG --libs x264)" X264_CFLAGS="$($PKGCONFIG --cflags x264)" X264_INCLUDE_DIR="${X264_INCLUDE_DIR-$($PKGCONFIG --variable=includedir x264)}" configure_system_override "$X264_INCLUDE_DIR" || die "Detection of system libx264 configuration failed" else die "Can not find system libx264" fi fi # autodetect options that weren't forced nor disabled libpthread="" if [ "$SYS" = "WINDOWS" -a "$thread" = "posix" ] ; then if [ "$gpl" = "no" ] ; then echo "Warning: pthread-win32 is LGPL and is therefore not supported with --disable-gpl" thread="no" elif cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then libpthread="-lpthread" elif cc_check pthread.h -lpthreadGC2 "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2" elif cc_check pthread.h "-lpthreadGC2 -lwsock32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2 -lwsock32" define PTW32_STATIC_LIB elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then libpthread="-lpthreadGC2 -lws2_32" define PTW32_STATIC_LIB else thread="no" fi elif [ "$thread" != "no" ] ; then thread="no" case $SYS in BEOS) thread="beos" define HAVE_BEOSTHREAD ;; WINDOWS) thread="win32" define HAVE_WIN32THREAD ;; QNX) 
cc_check pthread.h -lc "pthread_create(0,0,0,0);" && thread="posix" && libpthread="-lc" ;; *) if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then thread="posix" libpthread="-lpthread" else cc_check pthread.h "" "pthread_create(0,0,0,0);" && thread="posix" && libpthread="" fi ;; esac fi if [ "$thread" = "posix" ]; then LDFLAGS="$LDFLAGS $libpthread" define HAVE_POSIXTHREAD if [ "$SYS" = "LINUX" ] && cc_check sched.h "-D_GNU_SOURCE -Werror" "cpu_set_t p_aff; return CPU_COUNT(&p_aff);" ; then define HAVE_CPU_COUNT fi fi [ "$thread" != "no" ] && define HAVE_THREAD if cc_check '' '' 'int *val; __sync_fetch_and_add(&val, 1);' ; then define HAVE_SYNC_FETCH_AND_ADD fi if cc_check 'math.h' '' 'volatile float x = 2; return log2f(x);' ; then define HAVE_LOG2F fi if cc_check 'string.h' '' 'strtok_r(0, 0, 0);' ; then define HAVE_STRTOK_R fi if cc_check 'time.h' '' 'clock_gettime(CLOCK_MONOTONIC, 0);' ; then define HAVE_CLOCK_GETTIME elif cc_check 'time.h' '-lrt' 'clock_gettime(CLOCK_MONOTONIC, 0);' ; then define HAVE_CLOCK_GETTIME LDFLAGS="$LDFLAGS -lrt" fi if cc_check 'sys/auxv.h' '' 'getauxval(AT_HWCAP);' ; then define HAVE_GETAUXVAL fi if cc_check 'sys/auxv.h' '' 'unsigned long auxv = 0; elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv));' ; then define HAVE_ELF_AUX_INFO fi if cc_check 'unistd.h' '' 'sysconf(0);' ; then define HAVE_SYSCONF fi if [ "$SYS" != "WINDOWS" ] && cpp_check "sys/mman.h unistd.h" "" "defined(MAP_PRIVATE)"; then define HAVE_MMAP fi if [ "$SYS" = "LINUX" -a \( "$ARCH" = "X86" -o "$ARCH" = "X86_64" \) ] && cc_check "sys/mman.h" "" "MADV_HUGEPAGE;" ; then define HAVE_THP fi if [ "$cli" = "no" ] ; then avs="no" lavf="no" ffms="no" gpac="no" lsmash="no" mp4="no" swscale="no" fi if [ "$swscale" = "auto" ] ; then swscale="no" if pkg_check 'libswscale libavutil' ; then SWSCALE_LIBS="$SWSCALE_LIBS $($PKGCONFIG --libs libswscale libavutil)" SWSCALE_CFLAGS="$SWSCALE_CFLAGS $($PKGCONFIG --cflags libswscale libavutil)" fi [ -z "$SWSCALE_LIBS" ] && SWSCALE_LIBS="-lswscale -lavutil" if cc_check "libswscale/swscale.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "sws_init_context(0,0,0);" ; then if cc_check "libavutil/pixdesc.h" "$SWSCALE_CFLAGS $SWSCALE_LIBS" "AVComponentDescriptor x; x.depth = 8;" ; then swscale="yes" else echo "Warning: libswscale is too old" fi fi fi if [ "$lavf" = "auto" ] ; then lavf="no" if pkg_check 'libavformat libavcodec libavutil' ; then LAVF_LIBS="$LAVF_LIBS $($PKGCONFIG --libs libavformat libavcodec libavutil)" LAVF_CFLAGS="$LAVF_CFLAGS $($PKGCONFIG --cflags libavformat libavcodec libavutil)" fi if [ -z "$LAVF_LIBS" ] && cc_check '' -lavformat ; then LAVF_LIBS="-lavformat" for lib in -lavcodec -lavresample -lswresample -lavutil -lbz2 -lz $libpthread -lole32 -luser32 -lws2_32 -lsecur32 ; do cc_check "" $lib && LAVF_LIBS="$LAVF_LIBS $lib" done fi if cc_check libavformat/avformat.h "$LAVF_CFLAGS $LAVF_LIBS" "av_demuxer_iterate(0);" ; then if cc_check libavcodec/avcodec.h "$LAVF_CFLAGS $LAVF_LIBS" "avcodec_send_packet(0,0);" ; then lavf="yes" else echo "Warning: libavformat is too old" fi fi if [ "$lavf" = "yes" -a "$swscale" = "no" ]; then echo "Warning: libavformat is not supported without swscale support" lavf="no" fi fi if [ "$ffms" = "auto" ] ; then ffms_major="2"; ffms_minor="21"; ffms_micro="0"; ffms_bump="0" ffms="no" if pkg_check ffms2 ; then FFMS2_LIBS="$FFMS2_LIBS $($PKGCONFIG --libs ffms2)" FFMS2_CFLAGS="$FFMS2_CFLAGS $($PKGCONFIG --cflags ffms2)" fi [ -z "$FFMS2_LIBS" ] && FFMS2_LIBS="-lffms2" if cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS" 
"FFMS_DestroyVideoSource(0);" ; then ffms="yes" elif cc_check ffms.h "$FFMS2_CFLAGS $FFMS2_LIBS -lstdc++ $LAVF_LIBS" "FFMS_DestroyVideoSource(0);" ; then ffms="yes" FFMS2_LIBS="$FFMS2_LIBS -lstdc++ $LAVF_LIBS" fi error="ffms must be at least version $ffms_major.$ffms_minor.$ffms_micro.$ffms_bump" if [ $ffms = "yes" ] && ! cpp_check "ffms.h" "$FFMS2_CFLAGS" "FFMS_VERSION >= (($ffms_major << 24) | ($ffms_minor << 16) | ($ffms_micro << 8) | $ffms_bump)" "$error"; then ffms="no" echo "Warning: $error" fi if [ "$ffms" = "yes" -a "$swscale" = "no" ]; then echo "Warning: ffms is not supported without swscale support" ffms="no" fi fi if [ "$swscale" = "yes" ]; then LDFLAGSCLI="$SWSCALE_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $SWSCALE_CFLAGS" define HAVE_SWSCALE if [ "$lavf" = "yes" ]; then LDFLAGSCLI="$LAVF_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $LAVF_CFLAGS" define HAVE_LAVF fi if [ "$ffms" = "yes" ]; then LDFLAGSCLI="$FFMS2_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $FFMS2_CFLAGS" define HAVE_FFMS fi fi if [ "$lsmash" = "auto" ] ; then lsmash="no" if pkg_check liblsmash ; then LSMASH_LIBS="$LSMASH_LIBS $($PKGCONFIG --libs liblsmash)" LSMASH_CFLAGS="$LSMASH_CFLAGS $($PKGCONFIG --cflags liblsmash)" fi [ -z "$LSMASH_LIBS" ] && LSMASH_LIBS="-llsmash" if cc_check lsmash.h "$LSMASH_CFLAGS $LSMASH_LIBS" "lsmash_destroy_root(0);" ; then if cpp_check lsmash.h "$LSMASH_CFLAGS" "LSMASH_VERSION_MAJOR > 1 || (LSMASH_VERSION_MAJOR == 1 && LSMASH_VERSION_MINOR >= 5)" ; then lsmash="yes" else echo "Warning: lsmash is too old, update to rev.895 or later" fi fi fi if [ "$gpac" = "auto" -a "$lsmash" != "yes" ] ; then gpac="no" if pkg_check gpac ; then GPAC_LIBS_TMP="$GPAC_LIBS $($PKGCONFIG --libs gpac)" GPAC_CFLAGS_TMP="$GPAC_CFLAGS $($PKGCONFIG --cflags gpac)" if cc_check gpac/isomedia.h "$GPAC_CFLAGS_TMP $GPAC_LIBS_TMP" "gf_isom_close(0);" ; then GPAC_LIBS="$GPAC_LIBS_TMP" GPAC_CFLAGS="$GPAC_CFLAGS_TMP" else GPAC_LIBS_TMP="$GPAC_LIBS $($PKGCONFIG --static --libs gpac | sed 's/-lgpac //')" GPAC_CFLAGS_TMP="$GPAC_CFLAGS $($PKGCONFIG --static --cflags gpac)" if cc_check gpac/isomedia.h "$GPAC_CFLAGS_TMP $GPAC_LIBS_TMP" "gf_isom_close(0);" ; then GPAC_LIBS="$GPAC_LIBS_TMP" GPAC_CFLAGS="$GPAC_CFLAGS_TMP" fi fi fi if [ -z "$GPAC_LIBS" ] ; then GPAC_LIBS="-lgpac_static" cc_check "" -lz && GPAC_LIBS="$GPAC_LIBS -lz" cc_check "" -ldl && GPAC_LIBS="$GPAC_LIBS -ldl" if [ "$SYS" = "WINDOWS" ] ; then cc_check "" -lws2_32 && GPAC_LIBS="$GPAC_LIBS -lws2_32" cc_check "" -lwinmm && GPAC_LIBS="$GPAC_LIBS -lwinmm" fi fi if cc_check gpac/isomedia.h "$GPAC_CFLAGS $GPAC_LIBS" "gf_isom_close(0);" ; then if cc_check gpac/isomedia.h "$GPAC_CFLAGS $GPAC_LIBS" "gf_isom_set_pixel_aspect_ratio(0,0,0,0,0,0);" ; then gpac="yes" else echo "Warning: gpac is too old, update to v0.8.0 or later" fi fi fi if [ "$lsmash" = "yes" ] ; then mp4="lsmash" LDFLAGSCLI="$LSMASH_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $LSMASH_CFLAGS" define HAVE_LSMASH elif [ "$gpac" = "yes" ] ; then mp4="gpac" LDFLAGSCLI="$GPAC_LIBS $LDFLAGSCLI" CFLAGS="$CFLAGS $GPAC_CFLAGS" define HAVE_GPAC fi if [ "$avs" = "auto" ] ; then avs="no" # cygwin can use avisynth if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then avs="yes" define HAVE_AVS elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then avs="yes" define HAVE_AVS AVS_LIBS="-ldl" LDFLAGSCLI="$AVS_LIBS $LDFLAGSCLI" elif [ "$SYS" = "DRAGONFLY" -o "$SYS" = "FREEBSD" -o "$SYS" = "NETBSD" -o "$SYS" = "OPENBSD" -o "$SYS" = "HAIKU" ] ; then avs="yes" define HAVE_AVS # dlopen is exported 
from libc on both *BSD and Haiku fi fi cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT if [ "$pic" = "yes" ] ; then [ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC" [[ "$ASFLAGS" != *"-DPIC"* ]] && ASFLAGS="$ASFLAGS -DPIC" # resolve textrels in the x86 asm cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic" [ $SYS = SunOS -a "$ARCH" = "X86" ] && SOFLAGS="$SOFLAGS -mimpure-text" fi if [ "$debug" != "yes" -a "$gprof" != "yes" ]; then CFLAGS="$CFLAGS -fomit-frame-pointer" fi if [ "$strip" = "yes" ]; then LDFLAGS="$LDFLAGS -s" fi if [ "$debug" = "yes" ]; then CFLAGS="-O1 -g $CFLAGS" RCFLAGS="$RCFLAGS -DDEBUG" else CFLAGS="-O3 -ffast-math $CFLAGS" if [ "$lto" = "auto" ] && [ $compiler = GNU ] && cc_check "" "-flto" ; then lto="yes" CFLAGS="$CFLAGS -flto" LDFLAGS="$LDFLAGS -O3 -flto" fi fi [ "$lto" = "auto" ] && lto="no" if cc_check '' -fno-tree-vectorize ; then CFLAGS="$CFLAGS -fno-tree-vectorize" fi if [ $SYS = WINDOWS -a $ARCH = X86 -a $compiler = GNU ] ; then # workaround gcc/ld bug with alignment of static variables/arrays that are initialized to zero cc_check '' -fno-zero-initialized-in-bss && CFLAGS="$CFLAGS -fno-zero-initialized-in-bss" fi if cc_check "stdio.h" "-D_LARGEFILE_SOURCE=1 -D_FILE_OFFSET_BITS=64" "fseeko(stdin,0,0);" ; then define fseek fseeko define ftell ftello elif cc_check "stdio.h" "" "fseeko64(stdin,0,0);" ; then define fseek fseeko64 define ftell ftello64 elif cc_check "stdio.h" "" "_fseeki64(stdin,0,0);" ; then define fseek _fseeki64 define ftell _ftelli64 fi if cc_check '' -Wshadow ; then CFLAGS="-Wshadow $CFLAGS" fi if cc_check '' -Wmaybe-uninitialized ; then CFLAGS="-Wno-maybe-uninitialized $CFLAGS" fi if [ $compiler = GNU ] && cc_check '' -fvisibility=hidden ; then CFLAGS="$CFLAGS -fvisibility=hidden" fi if [ $compiler = ICC -o $compiler = ICL ] ; then if cc_check 'extras/intel_dispatcher.h' '' 'x264_intel_dispatcher_override();' ; then define HAVE_INTEL_DISPATCHER fi fi if [ "$bit_depth" = "all" ]; then define HAVE_BITDEPTH8 define HAVE_BITDEPTH10 elif [ "$bit_depth" -eq "8" ]; then define HAVE_BITDEPTH8 elif [ "$bit_depth" -eq "10" ]; then define HAVE_BITDEPTH10 opencl="no" fi if [ "$chroma_format" != "all" ]; then define CHROMA_FORMAT CHROMA_$chroma_format fi [ $gpl = yes ] && define HAVE_GPL && x264_gpl=1 || x264_gpl=0 [ $interlaced = yes ] && define HAVE_INTERLACED && x264_interlaced=1 || x264_interlaced=0 libdl="" if [ "$opencl" = "yes" ]; then opencl="no" # cygwin can use opencl if it can use LoadLibrary if [ $SYS = WINDOWS ] || ([ $SYS = CYGWIN ] && cc_check windows.h "" "LoadLibraryW(0);") ; then opencl="yes" define HAVE_OPENCL "(BIT_DEPTH==8)" elif [ "$SYS" = "LINUX" -o "$SYS" = "MACOSX" ] ; then opencl="yes" define HAVE_OPENCL "(BIT_DEPTH==8)" libdl="-ldl" fi LDFLAGS="$LDFLAGS $libdl" fi #define undefined vars as 0 for var in $CONFIG_HAVE; do grep -q "HAVE_$var " config.h || define HAVE_$var 0 done # generate exported config file [ "$bit_depth" = "all" ] && config_bit_depth="0" || config_bit_depth="$bit_depth" [ "$chroma_format" = "all" ] && config_chroma_format="0" || config_chroma_format="X264_CSP_I$chroma_format" cat > x264_config.h << EOF #define X264_GPL $x264_gpl #define X264_INTERLACED $x264_interlaced #define X264_BIT_DEPTH $config_bit_depth #define X264_CHROMA_FORMAT $config_chroma_format EOF ${SRCPATH}/version.sh >> x264_config.h if [ "$shared" = "yes" ]; then CFLAGSSO="$CFLAGSSO -DX264_API_EXPORTS" fi if [ "$cli_libx264" = "system" 
] ; then if [ "$shared" = "yes" ]; then if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then CLI_LIBX264='$(IMPLIBNAME)' else CLI_LIBX264='$(SONAME)' fi CFLAGSCLI="$CFLAGSCLI -DX264_API_IMPORTS" else CLI_LIBX264= LDFLAGSCLI="$X264_LIBS $LDFLAGSCLI" CFLAGSCLI="$CFLAGSCLI $X264_CFLAGS" cc_check 'stdint.h x264.h' '' 'x264_encoder_open(0);' || die "System libx264 can't be used for compilation of this version" fi else CLI_LIBX264='$(LIBX264)' fi if [ $compiler_style = MS ]; then DEPFLAGS="" DEPCMD='@$(SRCPATH)/tools/msvsdepend.sh "$(CC)" "$(CFLAGS)" "$<" "$@" > $(@:.o=.d)' AR="lib.exe -nologo -out:" LD="link.exe -out:" if [ $compiler = ICL ]; then AR="xi$AR" LD="xi$LD" else mslink="$(dirname "$(command -v cl.exe 2>/dev/null)")/link.exe" [ -x "$mslink" ] && LD="\"$mslink\" -out:" fi HAVE_GETOPT_LONG=0 LDFLAGS="-nologo -incremental:no $(cl_ldflags $LDFLAGS)" LDFLAGSCLI="$(cl_ldflags $LDFLAGSCLI)" LIBX264=libx264.lib RANLIB= [ -n "$RC" ] && RCFLAGS="$RCFLAGS -nologo -I. -I\$(SRCPATH)/extras -fo" STRIP= if [ $debug = yes ]; then LDFLAGS="-debug $LDFLAGS" CFLAGS="-D_DEBUG $CFLAGS" else CFLAGS="-DNDEBUG $CFLAGS" fi else # gcc/icc DEPFLAGS="${QPRE}MMD ${QPRE}MF"' $(@:.o=.d)' DEPCMD="" AR="$AR rc " LD="$CC -o " LIBX264=libx264.a [ -n "$RC" ] && RCFLAGS="$RCFLAGS -I. -o " fi if [ $compiler != GNU ]; then CFLAGS="$(cc_cflags $CFLAGS)" CFLAGSSO="$(cc_cflags $CFLAGSSO)" CFLAGSCLI="$(cc_cflags $CFLAGSCLI)" fi if [ $compiler = ICC -o $compiler = ICL ]; then # icc does not define __SSE__ until SSE2 optimization and icl never defines it or _M_IX86_FP [ \( $ARCH = X86_64 -o $ARCH = X86 \) -a $asm = yes ] && ! cpp_check "" "" "defined(__SSE__)" && define __SSE__ PROF_GEN_CC="${QPRE}prof-gen ${QPRE}prof-dir." PROF_GEN_LD= PROF_USE_CC="${QPRE}prof-use ${QPRE}prof-dir." PROF_USE_LD= elif [ $compiler = CL ]; then # Visual Studio # _M_IX86_FP is only defined on x86 [ $ARCH = X86 ] && cpp_check '' '' '_M_IX86_FP >= 1' && define __SSE__ [ $ARCH = X86_64 ] && define __SSE__ # As long as the cli application can't link against the dll, the dll can not be pgo'd. 
# pgds are link flag specific and the -dll flag for creating the dll makes it unshareable with the cli PROF_GEN_CC="-GL" PROF_GEN_LD="-LTCG:PGINSTRUMENT" PROF_USE_CC="-GL" PROF_USE_LD="-LTCG:PGOPTIMIZE" else PROF_GEN_CC="-fprofile-generate" PROF_GEN_LD="-fprofile-generate" PROF_USE_CC="-fprofile-use" PROF_USE_LD="-fprofile-use" fi # generate config files cat > config.mak << EOF SRCPATH=$SRCPATH prefix=$prefix exec_prefix=$exec_prefix bindir=$bindir libdir=$libdir includedir=$includedir SYS_ARCH=$ARCH SYS=$SYS CC=$CC CFLAGS=$CFLAGS CFLAGSSO=$CFLAGSSO CFLAGSCLI=$CFLAGSCLI COMPILER=$compiler COMPILER_STYLE=$compiler_style DEPCMD=$DEPCMD DEPFLAGS=$DEPFLAGS LD=$LD LDFLAGS=$LDFLAGS LDFLAGSCLI=$LDFLAGSCLI LIBX264=$LIBX264 CLI_LIBX264=$CLI_LIBX264 AR=$AR RANLIB=$RANLIB STRIP=$STRIP INSTALL=$INSTALL AS=$AS ASFLAGS=$ASFLAGS RC=$RC RCFLAGS=$RCFLAGS EXE=$EXE HAVE_GETOPT_LONG=$HAVE_GETOPT_LONG DEVNULL=$DEVNULL PROF_GEN_CC=$PROF_GEN_CC PROF_GEN_LD=$PROF_GEN_LD PROF_USE_CC=$PROF_USE_CC PROF_USE_LD=$PROF_USE_LD HAVE_OPENCL=$opencl EOF if [ $compiler_style = MS ]; then echo 'CC_O=-Fo$@' >> config.mak else echo 'CC_O=-o $@' >> config.mak fi if [ "$cli" = "yes" ]; then echo 'default: cli' >> config.mak echo 'install: install-cli' >> config.mak fi if [ "$shared" = "yes" ]; then API=$(grep '#define X264_BUILD' < ${SRCPATH}/x264.h | cut -f 3 -d ' ') if [ "$SYS" = "WINDOWS" -o "$SYS" = "CYGWIN" ]; then echo "SONAME=libx264-$API.dll" >> config.mak if [ $compiler_style = MS ]; then echo 'IMPLIBNAME=libx264.dll.lib' >> config.mak echo "SOFLAGS=-dll -implib:\$(IMPLIBNAME) $SOFLAGS" >> config.mak else echo 'IMPLIBNAME=libx264.dll.a' >> config.mak echo "SOFLAGS=-shared -Wl,--out-implib,\$(IMPLIBNAME) $SOFLAGS" >> config.mak fi elif [ "$SYS" = "MACOSX" ]; then echo "SOSUFFIX=dylib" >> config.mak echo "SONAME=libx264.$API.dylib" >> config.mak echo "SOFLAGS=-shared -dynamiclib -Wl,-single_module -Wl,-read_only_relocs,suppress -install_name \$(DESTDIR)\$(libdir)/\$(SONAME) $SOFLAGS" >> config.mak elif [ "$SYS" = "SunOS" ]; then echo "SOSUFFIX=so" >> config.mak echo "SONAME=libx264.so.$API" >> config.mak echo "SOFLAGS=-shared -Wl,-h,\$(SONAME) $SOFLAGS" >> config.mak else echo "SOSUFFIX=so" >> config.mak echo "SONAME=libx264.so.$API" >> config.mak echo "SOFLAGS=-shared -Wl,-soname,\$(SONAME) $SOFLAGS" >> config.mak fi echo 'default: lib-shared' >> config.mak echo 'install: install-lib-shared' >> config.mak fi if [ "$static" = "yes" ]; then echo 'default: lib-static' >> config.mak echo 'install: install-lib-static' >> config.mak fi if [ "$bashcompletion" = "auto" ]; then if [ "$cli" = "no" ]; then bashcompletion="no" elif [[ -z "$bashcompletionsdir" && "$prefix" != "/usr" && "$prefix" != "/usr/"* ]]; then bashcompletion="no" fi fi if [ "$bashcompletion" != "no" ]; then if [ -z "$bashcompletionsdir" ] && pkg_check bash-completion ; then bashcompletionsdir="$($PKGCONFIG --variable=completionsdir bash-completion)" fi if [ -n "$bashcompletionsdir" ]; then bashcompletion="yes" echo 'install: install-bashcompletion' >> config.mak echo "BASHCOMPLETIONSDIR=$bashcompletionsdir" >> config.mak else bashcompletion="no" fi fi cat > x264.pc << EOF prefix=$prefix exec_prefix=$exec_prefix libdir=$libdir includedir=$includedir Name: x264 Description: H.264 (MPEG4 AVC) encoder library Version: $(grep POINTVER < x264_config.h | sed -e 's/.* "//; s/".*//; s/ .*//') Libs: -L$libdir -lx264 $([ "$shared" = "yes" ] || echo $libpthread $libm $libdl) Libs.private: $([ "$shared" = "yes" ] && echo $libpthread $libm $libdl) Cflags: -I$includedir $([ 
"$shared" = "yes" ] && echo "-DX264_API_IMPORTS") EOF filters="crop select_every" [ $swscale = yes ] && filters="resize $filters" cat > conftest.log <> config.log cat conftest.log >> config.log cat conftest.log [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile mkdir -p common/{aarch64,arm,mips,ppc,x86,loongarch} encoder extras filters/video input output tools echo echo "You can run 'make' or 'make fprofiled' now." x264-master/doc/000077500000000000000000000000001502133446700136245ustar00rootroot00000000000000x264-master/doc/ratecontrol.txt000066400000000000000000000120361502133446700167230ustar00rootroot00000000000000A qualitative overview of x264's ratecontrol methods By Loren Merritt Historical note: This document is outdated, but a significant part of it is still accurate. Here are some important ways ratecontrol has changed since the authoring of this document: - By default, MB-tree is used instead of qcomp for weighting frame quality based on complexity. MB-tree is effectively a generalization of qcomp to the macroblock level. MB-tree also replaces the constant offsets for B-frame quantizers. The legacy algorithm is still available for low-latency applications. - Adaptive quantization is now used to distribute quality among each frame; frames are no longer constant quantizer, even if MB-tree is off. - VBV runs per-row rather than per-frame to improve accuracy. x264's ratecontrol is based on libavcodec's, and is mostly empirical. But I can retroactively propose the following theoretical points which underlie most of the algorithms: - You want the movie to be somewhere approaching constant quality. However, constant quality does not mean constant PSNR nor constant QP. Details are less noticeable in high-complexity or high-motion scenes, so you can get away with somewhat higher QP for the same perceived quality. - On the other hand, you get more quality per bit if you spend those bits in scenes where motion compensation works well: A given artifact may stick around several seconds in a low-motion scene, and you only have to fix it in one frame to improve the quality of the whole scene. - Both of the above are correlated with the number of bits it takes to encode a frame at a given QP. - Given one encoding of a frame, we can predict the number of bits needed to encode it at a different QP. This prediction gets less accurate if the QPs are far apart. - The importance of a frame depends on the number of other frames that are predicted from it. Hence I-frames get reduced QP depending on the number and complexity of following inter-frames, disposable B-frames get higher QP than P-frames, and referenced B-frames are between P-frames and disposable B-frames. The modes: 2pass: Given some data about each frame of a 1st pass (e.g. generated by 1pass ABR, below), we try to choose QPs to maximize quality while matching a specified total size. This is separated into 3 parts: (1) Before starting the 2nd pass, select the relative number of bits to allocate between frames. This pays no attention to the total size of the encode. The default formula, empirically selected to balance between the 1st 2 theoretical points, is "complexity ** 0.6", where complexity is defined to be the bit size of the frame at a constant QP (estimated from the 1st pass). (2) Scale the results of (1) to fill the requested total size. Optional: Impose VBV limitations. Due to nonlinearities in the frame size predictor and in VBV, this is an iterative process. (3) Now start encoding. 
After each frame, update future QPs to compensate for mispredictions in size. If the 2nd pass is consistently off from the predicted size (usually because we use slower compression options than the 1st pass), then we multiply all future frames' qscales by the reciprocal of the error. Additionally, there is a short-term compensation to prevent us from deviating too far from the desired size near the beginning (when we don't have much data for the global compensation) and near the end (when global doesn't have time to react). 1pass, average bitrate: The goal is the same as in 2pass, but here we don't have the benefit of a previous encode, so all ratecontrol must be done during the encode. (1) This is the same as in 2pass, except that instead of estimating complexity from a previous encode, we run a fast motion estimation algo over a half-resolution version of the frame, and use the SATD residuals (these are also used in the decision between P- and B-frames). Also, we don't know the size or complexity of the following GOP, so I-frame bonus is based on the past. (2) We don't know the complexities of future frames, so we can only scale based on the past. The scaling factor is chosen to be the one that would have resulted in the desired bitrate if it had been applied to all frames so far. (3) Overflow compensation is the same as in 2pass. By tuning the strength of compensation, you can get anywhere from near the quality of 2pass (but unpredictable size, like +- 10%) to reasonably strict filesize but lower quality. 1pass, constant bitrate (VBV compliant): (1) Same as ABR. (2) Scaling factor is based on a local average (dependent on VBV buffer size) instead of all past frames. (3) Overflow compensation is stricter, and has an additional term to hard limit the QPs if the VBV is near empty. Note that no hard limit is done for a full VBV, so CBR may use somewhat less than the requested bitrate. Note also that if a frame violates VBV constraints despite the best efforts of prediction, it is not re-encoded. 1pass, constant ratefactor: (1) Same as ABR. (2) The scaling factor is a constant based on the --crf argument. (3) No overflow compensation is done. constant quantizer: QPs are simply based on frame type. x264-master/doc/regression_test.txt000066400000000000000000000011401502133446700176000ustar00rootroot00000000000000Here is one test method which checks that the encoder's view of decoded pictures in the same as the decoder's view. This ensures that there is no distortion besides what is inherently caused by compression. # Install and compile x264 : git clone git://git.videolan.org/x264.git x264 cd x264 ./configure make cd .. # Install and compile JM reference decoder : wget http://iphome.hhi.de/suehring/tml/download/jm17.2.zip unzip jm17.2.zip cd JM sh unixprep.sh cd ldecod make cd ../.. ./x264/x264 input.yuv --dump-yuv fdec.yuv -o output.h264 ./JM/bin/ldecod.exe -i output.h264 -o ref.yuv diff ref.yuv fdec.yuv x264-master/doc/standards.txt000066400000000000000000000013401502133446700163460ustar00rootroot00000000000000x264 is written in C. The particular variant of C is: intersection of C99 and gcc>=3.4. checkasm is written in gcc, with no attempt at compatibility with anything else. We make the following additional assumptions which are true of real systems but not guaranteed by C99: * Two's complement. * Signed right-shifts are sign-extended. * int is 32-bit or larger. x86-specific assumptions: * The stack is 16-byte aligned. 
We align it on entry to libx264 and on entry to any thread, but the compiler must preserve alignment after that. * We call emms before any float operation and before returning from libx264, not after each mmx operation. So bad things could happen if the compiler inserts float operations where they aren't expected. x264-master/doc/threads.txt000066400000000000000000000124131502133446700160200ustar00rootroot00000000000000Historical notes: Slice-based threads was the original threading model of x264. It was replaced with frame-based threads in r607. This document was originally written at that time. Slice-based threading was brought back (as an optional mode) in r1364 for low-latency encoding. Furthermore, frame-based threading was modified significantly in r1246, with the addition of threaded lookahead. Old threading method: slice-based application calls x264 x264 runs B-adapt and ratecontrol (serial) split frame into several slices, and spawn a thread for each slice wait until all threads are done deblock and hpel filter (serial) return to application In x264cli, there is one additional thread to decode the input. New threading method: frame-based application calls x264 x264 requests a frame from lookahead, which runs B-adapt and ratecontrol parallel to the current thread, separated by a buffer of size sync-lookahead spawn a thread for this frame thread runs encode, deblock, hpel filter meanwhile x264 waits for the oldest thread to finish return to application, but the rest of the threads continue running in the background No additional threads are needed to decode the input, unless decoding is slower than slice+deblock+hpel, in which case an additional input thread would allow decoding in parallel. Penalties for slice-based threading: Each slice adds some bitrate (or equivalently reduces quality), for a variety of reasons: the slice header costs some bits, cabac contexts are reset, mvs and intra samples can't be predicted across the slice boundary. In CBR mode, multiple slices encode simultaneously, thus increasing the maximum misprediction possible with VBV. Some parts of the encoder are serial, so it doesn't scale well with lots of cpus. Some numbers on penalties for slicing: Tested at 720p with 45 slices (one per mb row) to maximize the total cost for easy measurement. Averaged over 4 movies at crf20 and crf30. Total cost: +30% bitrate at constant psnr. I enabled the various components of slicing one at a time, and measured the portion of that cost they contribute: * 34% intra prediction * 25% redundant slice headers, nal headers, and rounding to whole bytes * 16% mv prediction * 16% reset cabac contexts * 6% deblocking between slices (you don't strictly have to turn this off just for standard compliance, but you do if you want to use slices for decoder multithreading) * 2% cabac neighbors (cbp, skip, etc) The proportional cost of redundant headers should certainly depend on bitrate (since the header size is constant and everything else depends on bitrate). Deblocking should too (due to varying deblock strength). But none of the proportions should depend strongly on the number of slices: some are triggered per slice while some are triggered per macroblock-that's-on-the-edge-of-a-slice, but as long as there's no more than 1 slice per row, the relative frequency of those two conditions is determined solely by the image width. 
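To make the frame-based pipeline outlined above more concrete, here is a minimal, self-contained C sketch. It is illustration only, not x264 source: the names frame_job, fake_encode, NUM_THREADS and NUM_FRAMES are made up. It shows just the scheduling idea: up to NUM_THREADS frames are encoded in parallel, and the caller always blocks on the oldest in-flight frame, which preserves output order and bounds the added latency to NUM_THREADS-1 frames.

/*
 * Minimal frame-pipeline sketch (illustration only, not x264 source).
 * Compile with:  cc -pthread frame_pipeline.c
 */
#include <pthread.h>
#include <stdio.h>

#define NUM_THREADS 4    /* frames encoded in parallel */
#define NUM_FRAMES  20   /* frames in this toy run     */

typedef struct {
    int frame_num;
} frame_job;

/* stand-in for encode + deblock + hpel filter of one frame */
static void *fake_encode(void *arg)
{
    frame_job *job = arg;
    /* a real encoder would also publish per-row progress here so that
       later frames can start referencing finished rows early */
    (void)job;
    return NULL;
}

int main(void)
{
    pthread_t  tid[NUM_THREADS];
    frame_job  job[NUM_THREADS];
    int in_flight = 0;

    for (int f = 0; f < NUM_FRAMES; f++) {
        int slot = f % NUM_THREADS;
        if (in_flight == NUM_THREADS) {
            /* block on the oldest in-flight frame (the one whose slot is
               about to be reused); this keeps output in display order and
               bounds the extra latency to NUM_THREADS-1 frames */
            pthread_join(tid[slot], NULL);
            printf("delivered frame %d\n", job[slot].frame_num);
            in_flight--;
        }
        job[slot].frame_num = f;
        pthread_create(&tid[slot], NULL, fake_encode, &job[slot]);
        in_flight++;
    }

    /* drain the remaining in-flight frames, still in order */
    for (int f = NUM_FRAMES - in_flight; f < NUM_FRAMES; f++) {
        int slot = f % NUM_THREADS;
        pthread_join(tid[slot], NULL);
        printf("delivered frame %d\n", job[slot].frame_num);
    }
    return 0;
}

A real encoder additionally has to track how much of each reference frame has finished encoding, which is the source of the motion-vector restriction discussed below.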
Penalties for frame-based threading:

To allow encoding of multiple frames in parallel, we have to ensure that any given macroblock uses motion vectors only from pieces of the reference frames that have been encoded already. This is usually not noticeable, but can matter for very fast upward motion.

We have to commit to one frame type before starting on the frame. Thus scenecut detection must run during the lowres pre-motion-estimation along with B-adapt, which makes it faster but less accurate than re-encoding the whole frame.

Ratecontrol gets delayed feedback, since it has to plan frame N before frame N-1 finishes.

Benchmarks:

cpu: 8core Nehalem (2x E5520) 2.27GHz, hyperthreading disabled
kernel: linux 2.6.34.7, 64-bit
x264: r1732 b20059aa
input: http://media.xiph.org/video/derf/y4m/1080p/park_joy_1080p.y4m

NOTE: the "thread count" listed below does not count the lookahead thread, only encoding threads. This is why for "veryfast", the speedup for 2 and 3 threads exceeds the logical limit.

threads      speedup             psnr
          slice   frame      slice    frame
x264 --preset veryfast --tune psnr --crf 30
 1:       1.00x   1.00x     +0.000   +0.000
 2:       1.41x   2.29x     -0.005   -0.002
 3:       1.70x   3.65x     -0.035   +0.000
 4:       1.96x   3.97x     -0.029   -0.001
 5:       2.10x   3.98x     -0.047   -0.002
 6:       2.29x   3.97x     -0.060   +0.001
 7:       2.36x   3.98x     -0.057   -0.001
 8:       2.43x   3.98x     -0.067   -0.001
 9:               3.96x              +0.000
10:               3.99x              +0.000
11:               4.00x              +0.001
12:               4.00x              +0.001
x264 --preset medium --tune psnr --crf 30
 1:       1.00x   1.00x     +0.000   +0.000
 2:       1.54x   1.59x     -0.002   -0.003
 3:       2.01x   2.81x     -0.005   +0.000
 4:       2.51x   3.11x     -0.009   +0.000
 5:       2.89x   4.20x     -0.012   -0.000
 6:       3.27x   4.50x     -0.016   -0.000
 7:       3.58x   5.45x     -0.019   -0.002
 8:       3.79x   5.76x     -0.015   -0.002
 9:               6.49x              -0.000
10:               6.64x              -0.000
11:               6.94x              +0.000
12:               6.96x              +0.000
x264 --preset slower --tune psnr --crf 30
 1:       1.00x   1.00x     +0.000   +0.000
 2:       1.54x   1.83x     +0.000   +0.002
 3:       1.98x   2.21x     -0.006   +0.002
 4:       2.50x   2.61x     -0.011   +0.002
 5:       2.93x   3.94x     -0.018   +0.003
 6:       3.45x   4.19x     -0.024   +0.001
 7:       3.84x   4.52x     -0.028   -0.001
 8:       4.13x   5.04x     -0.026   -0.001
 9:               6.15x              +0.001
10:               6.24x              +0.001
11:               6.55x              -0.001
12:               6.89x              -0.001

x264-master/doc/vui.txt

Video Usability Information (VUI) Guide
by Christian Heine ( sennindemokrit at gmx dot net )

1. Sample Aspect Ratio
-----------------------

* What is it?

The Sample Aspect Ratio (SAR) (sometimes called Pixel Aspect Ratio or just Pel Aspect Ratio) is defined as the ratio of the width of the sample to the height of the sample. While pixels on a computer monitor generally are "square" meaning that their SAR is 1:1, digitized video usually has rather odd SARs. Playback of material with a particular SAR on a system with a different SAR will result in a stretched/squashed image. A correction is necessary that relies on the knowledge of both SARs.

* How do I use it?

You can derive the SAR of an image from the width, height and the display aspect ratio (DAR) of the image as follows:

    SAR_x   DAR_x * height
    ----- = --------------
    SAR_y   DAR_y * width

for example: width x height = 704x576, DAR = 4:3  ==>  SAR = 2304:2112 or 12:11

Please note that if your material is a digitized analog signal, you should not use this equation to calculate the SAR. Refer to the manual of your digitizing equipment or this link instead.

A Quick Guide to Digital Video Resolution and Aspect Ratio Conversions
http://www.iki.fi/znark/video/conversion/

* Should I use this option?

In one word: yes.
Most decoders/ media players nowadays support automatic correction of aspect ratios, and there are just few exceptions. You should even use it, if the SAR of your material is 1:1, as the default of x264 is "SAR not defined". 2. Overscan ------------ * What is it? The term overscan generally refers to all regions of an image that do not contain information but are added to achieve a certain resolution or aspect ratio. A "letterboxed" image therefore has overscan at the top and the bottom. This is not the overscan this option refers to. Neither refers it to the overscan that is added as part of the process of digitizing an analog signal. Instead it refers to the "overscan" process on a display that shows only a part of the image. What that part is depends on the display. * How do I use this option? As I'm not sure about what part of the image is shown when the display uses an overscan process, I can't provide you with rules or examples. The safe assumption would be "overscan=show" as this always shows the whole image. Use "overscan=crop" only if you are sure about the consequences. You may also use the default value ("undefined"). * Should I use this option? Only if you know exactly what you are doing. Don't use it on video streams that have general overscan. Instead try to to crop the borders before encoding and benefit from the higher bitrate/ image quality. Furthermore the H264 specification says that the setting "overscan=show" must be respected, but "overscan=crop" may be ignored. In fact most playback equipment ignores this setting and shows the whole image. 3. Video Format ---------------- * What is it? A purely informative setting, that explains what the type of your analog video was, before you digitized it. * How do I use this option? Just set it to the desired value. ( e.g. NTSC, PAL ) If you transcode from MPEG2, you may find the value for this option in the m2v bitstream. (see ITU-T Rec. H262 / ISO/IEC 13818-2 for details) * Should I use this option? That is entirely up to you. I have no idea how this information would ever be relevant. I consider it to be informative only. 4. Full Range -------------- * What is it? Another relic from digitizing analog video. When digitizing analog video the digital representation of the luma and chroma levels is limited to lie within 16..235 and 16..240 respectively. Playback equipment usually assumes all digitized samples to be within this range. However most DVDs use the full range of 0..255 for luma and chroma samples, possibly resulting in an oversaturation when played back on that equipment. To avoid this a range correction is needed. * How do I use this option? If your source material is a digitized analog video/TV broadcast it is quite possible that it is range limited. If you can make sure that it is range limited you can safely set full range to off. If you are not sure or want to make sure that your material is played back without oversaturation, set if to on. Please note that the default for this option in x264 is off, which is not a safe assumption. * Should I use this option? Yes, but there are few decoders/ media players that distinguish between the two options. 5. Color Primaries, Transfer Characteristics, Matrix Coefficients ------------------------------------------------------------------- * What is it? A videophile setting. The average users won't ever need it. Not all monitor models show all colors the same way. 
When comparing the same image on two different monitor models you might find that one of them "looks more blue", while the other "looks more green". Bottom line is, each monitor model has a different color profile, which can be used to correct colors in a way, that images look almost the same on all monitors. The same goes for printers and film/ video digitizing equipment. If the color profile of the digitizing equipment is known, it is possible to correct the colors and gamma of the decoded h264 stream in a way that the video stream looks the same, regardless of the digitizing equipment used. * How do I use these options? If you are able to find out which characteristics your digitizing equipment uses, (see the equipment documentation or make reference measurements) then find the most suitable characteristics in the list of available characteristics (see H264 Annex E) and pass it to x264. Otherwise leave it to the default (unspecified). If you transcode from MPEG2, you may find the values for these options in the m2v bitstream. (see ITU-T Rec. H262 / ISO/IEC 13818-2 for details) * Should I use these options? Only if you know exactly what you are doing. The default setting is better than a wrong one. Use of this option is not a bad idea though. Unfortunately I don't know any decoder/ media player that ever even attempted color/gamma/color matrix correction. 6. Chroma Sample Location -------------------------- * What is it? A videophile setting. The average user won't ever notice a difference. Due to a weakness of the eye, it is often economic to reduce the number of chroma samples in a process called subsampling. In particular x264 uses only one chroma sample of each chroma channel every block of 2x2 luma samples. There are a number of possibilities on how this subsampling is done, each resulting in another relative location of the chroma sample towards the luma samples. The Chroma Sample Location matters when the subsampling process is reversed, e.g. the number of chroma samples is increased. This is most likely to happen at color space conversions. If it is not done correctly the chroma values may appear shifted compared to the luma samples by at most 1 pixel, or strangely blurred. * How do I use this option? Because x264 does no subsampling, since it only accepts already subsampled input frames, you have to determine the method yourself. If you transcode from MPEG1 with proper subsampled 4:2:0, and don't do any color space conversion, you should set this option to 1. If you transcode from MPEG2 with proper subsampled 4:2:0, and don't do any color space conversion, you should set this option to 0. If you transcode from MPEG4 with proper subsampled 4:2:0, and don't do any color space conversion, you should set this option to 0. If you do the color space conversion yourself this isn't that easy. If the filter kernel of the subsampling is ( 0.5, 0.5 ) in one direction then the chroma sample location in that direction is between the two luma samples. If your filter kernel is ( 0.25, 0.5, 0.25 ) in one direction then the chroma sample location in that direction is equal to one of the luma samples. H264 Annex E contains images that tell you how to "transform" your Chroma Sample Location into a value of 0 to 5 that you can pass to x264. * Should I use this option? Unless you are a perfectionist, don't bother. Media players ignore this setting, and favor their own (fixed) assumed Chroma Sample Location. 
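All of the settings discussed in this guide are passed to x264 through the vui part of x264_param_t, or through the matching --sar, --overscan, --videoformat, --range, --colorprim, --transfer, --colormatrix and --chromaloc command line options. A rough sketch for a hypothetical 704x576 PAL source with DAR 4:3 follows; the helper name set_vui_example is invented for the example and the chosen values only illustrate the mapping, they are not recommendations for your material.

#include "x264.h"

/* Sketch: VUI settings for a hypothetical range-limited 704x576 PAL source, DAR 4:3. */
static void set_vui_example( x264_param_t *param )
{
    /* 1. Sample aspect ratio: 704x576 at DAR 4:3 gives SAR 12:11 (see section 1). */
    param->vui.i_sar_width  = 12;
    param->vui.i_sar_height = 11;
    /* 2. Overscan: 0 = undef (the default), 1 = "show", 2 = "crop". */
    param->vui.i_overscan   = 1;
    /* 3. Video format: informative only; 1 is PAL in the H264 Annex E table. */
    param->vui.i_vidformat  = 1;
    /* 4. Full range: off for a range-limited (16..235 / 16..240) source. */
    param->vui.b_fullrange  = 0;
    /* 5. Colour description: BT.470BG / BT.601 code points, typical for PAL SD. */
    param->vui.i_colorprim  = 5;
    param->vui.i_transfer   = 6;
    param->vui.i_colmatrix  = 5;
    /* 6. Chroma sample location: 0 for MPEG2-style 4:2:0, 1 for MPEG1-style. */
    param->vui.i_chroma_loc = 0;
}

The colour related codes are the ones listed in H264 Annex E; any field you leave at its default stays "undefined", which as explained above is better than a wrong value.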
x264-master/encoder/000077500000000000000000000000001502133446700144765ustar00rootroot00000000000000x264-master/encoder/analyse.c000066400000000000000000005020641502133446700163050ustar00rootroot00000000000000/***************************************************************************** * analyse.c: macroblock analysis ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "macroblock.h" #include "me.h" #include "ratecontrol.h" #include "analyse.h" #include "rdo.c" typedef struct { x264_me_t me16x16; x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */ x264_me_t me8x8[4]; x264_me_t me4x4[4][4]; x264_me_t me8x4[4][2]; x264_me_t me4x8[4][2]; x264_me_t me16x8[2]; x264_me_t me8x16[2]; int i_rd16x16; int i_cost8x8; int i_cost4x4[4]; /* cost per 8x8 partition */ int i_cost8x4[4]; /* cost per 8x8 partition */ int i_cost4x8[4]; /* cost per 8x8 partition */ int i_cost16x8; int i_cost8x16; /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3], [ref][5] is for alignment */ ALIGNED_8( int16_t mvc[32][6][2] ); } x264_mb_analysis_list_t; typedef struct { /* conduct the analysis using this lamda and QP */ int i_lambda; int i_lambda2; int i_qp; uint16_t *p_cost_mv; uint16_t *p_cost_ref[2]; int i_mbrd; /* I: Intra part */ /* Take some shortcuts in intra search if intra is deemed unlikely */ int b_fast_intra; int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */ int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. 
*/ int b_try_skip; /* Luma part */ int i_satd_i16x16; int i_satd_i16x16_dir[7]; int i_predict16x16; int i_satd_i8x8; int i_cbp_i8x8_luma; ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] ); int i_predict8x8[4]; int i_satd_i4x4; int i_predict4x4[16]; int i_satd_pcm; /* Chroma part */ int i_satd_chroma; int i_satd_chroma_dir[7]; int i_predict8x8chroma; /* II: Inter part P/B frame */ x264_mb_analysis_list_t l0; x264_mb_analysis_list_t l1; int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */ int i_cost16x16direct; int i_cost8x8bi; int i_cost8x8direct[4]; int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */ int i_cost_est16x8[2]; /* Per-partition estimated cost */ int i_cost_est8x16[2]; int i_cost16x8bi; int i_cost8x16bi; int i_rd16x16bi; int i_rd16x16direct; int i_rd16x8bi; int i_rd8x16bi; int i_rd8x8bi; int i_mb_partition16x8[2]; /* mb_partition_e */ int i_mb_partition8x16[2]; int i_mb_type16x8; /* mb_class_e */ int i_mb_type8x16; int b_direct_available; int b_early_terminate; } x264_mb_analysis_t; /* TODO: calculate CABAC costs */ static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = { 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0 }; static const uint8_t i_mb_b16x8_cost_table[17] = { 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9 }; static const uint8_t i_sub_mb_b_cost_table[13] = { 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1 }; static const uint8_t i_sub_mb_p_cost_table[4] = { 5, 3, 3, 1 }; static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); static int init_costs( x264_t *h, float *logs, int qp ) { if( h->cost_mv[qp] ) return 0; int mv_range = h->param.analyse.i_mv_range << PARAM_INTERLACED; int lambda = x264_lambda_tab[qp]; /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) ); h->cost_mv[qp] += 2*4*mv_range; for( int i = 0; i <= 2*4*mv_range; i++ ) { h->cost_mv[qp][-i] = h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX ); } for( int i = 0; i < 3; i++ ) for( int j = 0; j < 33; j++ ) h->cost_table->ref[qp][i][j] = i ? 
X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0; if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] ) { for( int j = 0; j < 4; j++ ) { CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) ); h->cost_mv_fpel[qp][j] += 2*mv_range; for( int i = -2*mv_range; i < 2*mv_range; i++ ) h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j]; } } uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[qp]; for( int i = 0; i < 17; i++ ) cost_i4x4_mode[i] = 3*lambda*(i!=8); return 0; fail: return -1; } int x264_analyse_init_costs( x264_t *h ) { int mv_range = h->param.analyse.i_mv_range << PARAM_INTERLACED; float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) ); if( !logs ) return -1; logs[0] = 0.718f; for( int i = 1; i <= 2*4*mv_range; i++ ) logs[i] = log2f( i+1 ) * 2.0f + 1.718f; for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ ) if( init_costs( h, logs, qp ) ) goto fail; if( init_costs( h, logs, X264_LOOKAHEAD_QP ) ) goto fail; x264_free( logs ); return 0; fail: x264_free( logs ); return -1; } void x264_analyse_free_costs( x264_t *h ) { int mv_range = h->param.analyse.i_mv_range << PARAM_INTERLACED; for( int i = 0; i < QP_MAX+1; i++ ) { if( h->cost_mv[i] ) x264_free( h->cost_mv[i] - 2*4*mv_range ); for( int j = 0; j < 4; j++ ) { if( h->cost_mv_fpel[i][j] ) x264_free( h->cost_mv_fpel[i][j] - 2*mv_range ); } } } void x264_analyse_weight_frame( x264_t *h, int end ) { for( int j = 0; j < h->i_ref[0]; j++ ) { if( h->sh.weight[j][0].weightfn ) { x264_frame_t *frame = h->fref[0][j]; int width = frame->i_width[0] + PADH2; int i_padv = PADV << PARAM_INTERLACED; int offset, height; pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH_ALIGN; height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted; offset = h->fenc->i_lines_weighted*frame->i_stride[0]; h->fenc->i_lines_weighted += height; if( height ) for( int k = j; k < h->i_ref[0]; k++ ) if( h->sh.weight[k][0].weightfn ) { pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN; x264_weight_scale_plane( h, dst + offset, frame->i_stride[0], src + offset, frame->i_stride[0], width, height, &h->sh.weight[k][0] ); } break; } } } /* initialize an array of lambda*nbits for all possible mvs */ static void mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) { a->p_cost_mv = h->cost_mv[a->i_qp]; a->p_cost_ref[0] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; a->p_cost_ref[1] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; } static void mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp ) { int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 ); a->i_lambda = x264_lambda_tab[qp]; a->i_lambda2 = x264_lambda2_tab[qp]; h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd; if( h->param.analyse.i_trellis ) { h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp]; h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp]; h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp]; h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp]; } h->mb.i_psy_rd_lambda = a->i_lambda; /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */ int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET ); h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? 
x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256; if( qp > QP_MAX_SPEC ) { h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1]; h->nr_residual_sum = h->nr_residual_sum_buf[1]; h->nr_count = h->nr_count_buf[1]; h->mb.b_noise_reduction = 1; qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */ } else { h->nr_offset = h->nr_offset_denoise; h->nr_residual_sum = h->nr_residual_sum_buf[0]; h->nr_count = h->nr_count_buf[0]; h->mb.b_noise_reduction = 0; } a->i_qp = h->mb.i_qp = qp; h->mb.i_chroma_qp = h->chroma_qp_table[qp]; } static void mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) { int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B); /* mbrd == 1 -> RD mode decision */ /* mbrd == 2 -> RD refinement */ /* mbrd == 3 -> QPRD */ a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10); h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1; a->b_early_terminate = h->param.analyse.i_subpel_refine < 11; mb_analyse_init_qp( h, a, qp ); h->mb.b_transform_8x8 = 0; /* I: Intra part */ a->i_satd_i16x16 = a->i_satd_i8x8 = a->i_satd_i4x4 = COST_MAX; a->i_satd_chroma = CHROMA_FORMAT ? COST_MAX : 0; /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it. * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */ uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8; a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX; a->b_fast_intra = 0; a->b_avoid_topright = 0; h->mb.i_skip_intra = h->mb.b_lossless ? 0 : a->i_mbrd ? 2 : !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction; /* II: Inter part P/B frame */ if( h->sh.i_type != SLICE_TYPE_I ) { int i_fmv_range = 4 * h->param.analyse.i_mv_range; // limit motion search to a slightly smaller range than the theoretical limit, // since the search may go a few iterations past its given range int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel /* Calculate max allowed MV range */ h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 ); h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 ); h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range ); h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 ); if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P ) { int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */ int max_mv = max_x - 4*16*h->mb.i_mb_x; /* If we're left of the refresh bar, don't reference right of it. 
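             * Macroblocks left of the bar form the already-refreshed region, so they must not fetch data from
             * right of the reference frame's refresh bar; otherwise a decoder that joined the stream at the
             * start of the refresh period could not reconstruct them.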
*/ if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; int thread_mvy_range = i_fmv_range; if( h->i_thread_frames > 1 ) { int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16; int thresh = pix_y + h->param.analyse.i_mv_range_thread; for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- ) for( int j = 0; j < h->i_ref[i]; j++ ) { int completed = x264_frame_cond_wait( h->fref[i][j]->orig, thresh ); thread_mvy_range = X264_MIN( thread_mvy_range, completed - pix_y ); } if( h->param.b_deterministic ) thread_mvy_range = h->param.analyse.i_mv_range_thread; if( PARAM_INTERLACED ) thread_mvy_range >>= 1; x264_analyse_weight_frame( h, pix_y + thread_mvy_range ); } if( PARAM_INTERLACED ) { /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */ for( int i = 0; i < 3; i++ ) { int j = i == 2; mb_y = (h->mb.i_mb_y >> j) + (i == 1); h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 ); h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 ); h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range ); h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range ); h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border; h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border; } } else { h->mb.mv_min[1] = 4*( -16*mb_y - 24 ); h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 ); h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range ); h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range ); h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) { int i = MB_INTERLACED ? 
2 : h->mb.i_mb_y&1; h->mb.mv_min[1] = h->mb.mv_miny_row[i]; h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } a->l0.me16x16.cost = a->l0.i_rd16x16 = a->l0.i_cost8x8 = a->l0.i_cost16x8 = a->l0.i_cost8x16 = COST_MAX; if( h->sh.i_type == SLICE_TYPE_B ) { a->l1.me16x16.cost = a->l1.i_rd16x16 = a->l1.i_cost8x8 = a->i_cost8x8direct[0] = a->i_cost8x8direct[1] = a->i_cost8x8direct[2] = a->i_cost8x8direct[3] = a->l1.i_cost16x8 = a->l1.i_cost8x16 = a->i_rd16x16bi = a->i_rd16x16direct = a->i_rd8x8bi = a->i_rd16x8bi = a->i_rd8x16bi = a->i_cost16x16bi = a->i_cost16x16direct = a->i_cost8x8bi = a->i_cost16x8bi = a->i_cost8x16bi = COST_MAX; } else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 ) for( int i = 0; i < 4; i++ ) { a->l0.i_cost4x4[i] = a->l0.i_cost8x4[i] = a->l0.i_cost4x8[i] = COST_MAX; } /* Fast intra decision */ if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 ) { if( IS_INTRA( h->mb.i_mb_type_left[0] ) || IS_INTRA( h->mb.i_mb_type_top ) || IS_INTRA( h->mb.i_mb_type_topleft ) || IS_INTRA( h->mb.i_mb_type_topright ) || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16] + h->stat.frame.i_mb_count[I_PCM])) ) { /* intra is likely */ } else { a->b_fast_intra = 1; } } h->mb.b_skip_mc = 0; if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P && h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col ) { a->b_force_intra = 1; a->b_fast_intra = 0; a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col; } else a->b_force_intra = 0; } } /* Prediction modes allowed for various combinations of neighbors. */ /* Terminated by a -1. 
*/ /* In order, no neighbors, left, top, top/left, top/left/topleft */ static const int8_t i16x16_mode_available[5][5] = { {I_PRED_16x16_DC_128, -1, -1, -1, -1}, {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1}, {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1}, {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1}, {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1}, }; static const int8_t chroma_mode_available[5][5] = { {I_PRED_CHROMA_DC_128, -1, -1, -1, -1}, {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1}, {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1}, {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1}, {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1}, }; static const int8_t i8x8_mode_available[2][5][10] = { { {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1}, {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1}, }, { {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1}, {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1}, } }; static const int8_t i4x4_mode_available[2][5][10] = { { {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1}, {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1}, }, { {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1}, {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1}, } }; static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour ) { int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); return i16x16_mode_available[idx]; } static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour ) { int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); return chroma_mode_available[idx]; } static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i ) { int avoid_topright = force_intra && (i&1); int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 
4 : idx & (MB_TOP|MB_LEFT); return i8x8_mode_available[avoid_topright][idx]; } static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i ) { int avoid_topright = force_intra && ((i&5) == 5); int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); return i4x4_mode_available[avoid_topright][idx]; } /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */ static inline void psy_trellis_init( x264_t *h, int do_both_dct ) { if( do_both_dct || h->mb.b_transform_8x8 ) h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], (pixel*)x264_zero ); if( do_both_dct || !h->mb.b_transform_8x8 ) h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], (pixel*)x264_zero ); } /* Reset fenc satd scores cache for psy RD */ static inline void mb_init_fenc_cache( x264_t *h, int b_satd ) { if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis ) psy_trellis_init( h, h->param.analyse.b_transform_8x8 ); if( !h->mb.i_psy_rd ) return; M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO; M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO; M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO; M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO; h->mb.pic.fenc_hadamard_cache[8] = 0; if( b_satd ) h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) ); } static void mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) { if( a->i_satd_chroma < COST_MAX ) return; if( CHROMA444 ) { if( !h->mb.b_chroma_me ) { a->i_satd_chroma = 0; return; } /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */ if( h->mb.b_lossless ) { x264_predict_lossless_16x16( h, 1, a->i_predict16x16 ); x264_predict_lossless_16x16( h, 2, a->i_predict16x16 ); } else { h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] ); h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] ); } a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); return; } const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; /* Prediction selection for chroma */ if( predict_mode[3] >= 0 && !h->mb.b_lossless ) { int satdu[4], satdv[4]; h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu ); h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ); satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode ); a->i_satd_chroma_dir[i_mode] = i_satd; COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode ); } } else { for( ; *predict_mode >= 0; predict_mode++ ) { int i_satd; int i_mode = *predict_mode; /* we do the prediction */ if( h->mb.b_lossless ) x264_predict_lossless_chroma( h, i_mode ); else { h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); 
h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } /* we calculate the cost */ i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) + a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] ); a->i_satd_chroma_dir[i_mode] = i_satd; COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode ); } } h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; } /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */ static void mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter ) { const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter; pixel *p_src = h->mb.pic.p_fenc[0]; pixel *p_dst = h->mb.pic.p_fdec[0]; static const int8_t intra_analysis_shortcut[2][2][2][5] = { {{{I_PRED_4x4_HU, -1, -1, -1, -1}, {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}}, {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1}, {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}}, {{{I_PRED_4x4_HU, -1, -1, -1, -1}, {-1, -1, -1, -1, -1}}, {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1}, {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}}, }; int idx; int lambda = a->i_lambda; /*---------------- Try all mode and calculate their score ---------------*/ /* Disabled i16x16 for AVC-Intra compat */ if( !h->param.i_avcintra_class ) { const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra ); /* Not heavily tuned */ static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 }; int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX; if( !h->mb.b_lossless && predict_mode[3] >= 0 ) { h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir ); a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0); a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1); a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2); COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 ); COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 ); COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 ); /* Plane is expensive, so don't check it unless one of the previous modes was useful. 
*/ if( a->i_satd_i16x16 <= i16x16_thresh ) { h->predict_16x16[I_PRED_16x16_P]( p_dst ); a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ); a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3); COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 ); } } else { for( ; *predict_mode >= 0; predict_mode++ ) { int i_satd; int i_mode = *predict_mode; if( h->mb.b_lossless ) x264_predict_lossless_16x16( h, 0, i_mode ); else h->predict_16x16[i_mode]( p_dst ); i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) + lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] ); COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode ); a->i_satd_i16x16_dir[i_mode] = i_satd; } } if( h->sh.i_type == SLICE_TYPE_B ) /* cavlc mb type prefix */ a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16]; if( a->i_satd_i16x16 > i16x16_thresh ) return; } uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[a->i_qp] + 8; /* 8x8 prediction selection */ if( flags & X264_ANALYSE_I8x8 ) { ALIGNED_ARRAY_32( pixel, edge,[36] ); x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); // FIXME some bias like in i4x4? int i_cost = lambda * 4; /* base predmode costs */ h->mb.i_cbp_luma = 0; if( h->sh.i_type == SLICE_TYPE_B ) i_cost += lambda * i_mb_b_cost_table[I_8x8]; for( idx = 0;; idx++ ) { int x = idx&1; int y = idx>>1; pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE; pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE; int i_best = COST_MAX; int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx ); h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 ) { /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */ i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] ); i_cost += i_best & 0xffff; i_best >>= 16; a->i_predict8x8[idx] = i_best; if( idx == 3 || i_cost > i_satd_thresh ) break; x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best ); } else { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { ALIGNED_ARRAY_16( int32_t, satd,[4] ); h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; if( i_pred_mode < 3 ) satd[i_pred_mode] -= 3 * lambda; for( int i = 2; i >= 0; i-- ) { int cost = satd[i]; a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda; COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i ); } /* Take analysis shortcuts: don't analyse modes that are too * far away direction-wise from the favored mode. 
*/ if( a->i_mbrd < 1 + a->b_fast_intra ) predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; else predict_mode += 3; } for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ ) { int i_satd; int i_mode = *predict_mode; if( h->mb.b_lossless ) x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge ); else h->predict_8x8[i_mode]( p_dst_by, edge ); i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) i_satd -= 3 * lambda; COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode ); a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda; } i_cost += i_best + 3*lambda; if( idx == 3 || i_cost > i_satd_thresh ) break; if( h->mb.b_lossless ) x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge ); else h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge ); x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); } /* we need to encode this block now (for next ones) */ x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 ); } if( idx == 3 ) { a->i_satd_i8x8 = i_cost; if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ); h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ); h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ); h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) ); } } else { static const uint16_t cost_div_fix8[3] = {1024,512,341}; a->i_satd_i8x8 = COST_MAX; i_cost = (i_cost * cost_div_fix8[idx]) >> 8; } /* Not heavily tuned */ static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 }; if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 ) return; } /* 4x4 prediction selection */ if( flags & X264_ANALYSE_I4x4 ) { int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */ int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX; h->mb.i_cbp_luma = 0; if( a->b_early_terminate && a->i_mbrd ) i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; if( h->sh.i_type == SLICE_TYPE_B ) i_cost += lambda * i_mb_b_cost_table[I_4x4]; for( idx = 0;; idx++ ) { pixel *p_src_by = p_src + block_idx_xy_fenc[idx]; pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx]; int i_best = COST_MAX; int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx ); const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx ); if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) /* emulate missing topright samples */ MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] ); if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 ) { /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. 
*/ i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode ); i_cost += i_best & 0xffff; i_best >>= 16; a->i_predict4x4[idx] = i_best; if( i_cost > i_satd_thresh || idx == 15 ) break; h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best; } else { if( !h->mb.b_lossless && predict_mode[5] >= 0 ) { ALIGNED_ARRAY_16( int32_t, satd,[4] ); h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; if( i_pred_mode < 3 ) satd[i_pred_mode] -= 3 * lambda; i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC; COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H ); COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V ); /* Take analysis shortcuts: don't analyse modes that are too * far away direction-wise from the favored mode. */ if( a->i_mbrd < 1 + a->b_fast_intra ) predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; else predict_mode += 3; } if( i_best > 0 ) { for( ; *predict_mode >= 0; predict_mode++ ) { int i_satd; int i_mode = *predict_mode; if( h->mb.b_lossless ) x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode ); else h->predict_4x4[i_mode]( p_dst_by ); i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE ); if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) { i_satd -= lambda * 3; if( i_satd <= 0 ) { i_best = i_satd; a->i_predict4x4[idx] = i_mode; break; } } COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); } } i_cost += i_best + 3 * lambda; if( i_cost > i_satd_thresh || idx == 15 ) break; if( h->mb.b_lossless ) x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] ); else h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by ); h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx]; } /* we need to encode this block now (for next ones) */ x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 ); } if( idx == 15 ) { a->i_satd_i4x4 = i_cost; if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ); h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ); h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ); h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) ); } } else a->i_satd_i4x4 = COST_MAX; } } static void intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh ) { if( !a->b_early_terminate ) i_satd_thresh = COST_MAX; if( a->i_satd_i16x16 < i_satd_thresh ) { h->mb.i_type = I_16x16; analyse_update_cache( h, a ); a->i_satd_i16x16 = rd_cost_mb( h, a->i_lambda2 ); } else a->i_satd_i16x16 = COST_MAX; if( a->i_satd_i4x4 < i_satd_thresh ) { h->mb.i_type = I_4x4; analyse_update_cache( h, a ); a->i_satd_i4x4 = rd_cost_mb( h, a->i_lambda2 ); } else a->i_satd_i4x4 = COST_MAX; if( a->i_satd_i8x8 < i_satd_thresh ) { h->mb.i_type = I_8x8; analyse_update_cache( h, a ); a->i_satd_i8x8 = rd_cost_mb( h, a->i_lambda2 ); a->i_cbp_i8x8_luma = h->mb.i_cbp_luma; } else a->i_satd_i8x8 = COST_MAX; } static void intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) { uint64_t i_satd, i_best; int plane_count = CHROMA444 ? 
3 : 1; h->mb.i_skip_intra = 0; if( h->mb.i_type == I_16x16 ) { int old_pred_mode = a->i_predict16x16; const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra ); int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX; i_best = a->i_satd_i16x16; for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh ) continue; h->mb.i_intra16x16_pred_mode = i_mode; i_satd = rd_cost_mb( h, a->i_lambda2 ); COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode ); } } /* RD selection for chroma prediction */ if( CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422 ) { const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra ); if( predict_mode[1] >= 0 ) { int8_t predict_mode_sorted[4]; int i_max; int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX; for( i_max = 0; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma ) predict_mode_sorted[i_max++] = i_mode; } if( i_max > 0 ) { int i_cbp_chroma_best = h->mb.i_cbp_chroma; int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp]; /* the previous thing encoded was intra_rd(), so the pixels and * coefs for the current chroma mode are still around, so we only * have to recount the bits. */ i_best = rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); for( int i = 0; i < i_max; i++ ) { int i_mode = predict_mode_sorted[i]; if( h->mb.b_lossless ) x264_predict_lossless_chroma( h, i_mode ); else { h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } /* if we've already found a mode that needs no residual, then * probably any mode with a residual will be worse. * so avoid dct on the remaining modes to improve speed. */ i_satd = rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma ); } h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; h->mb.i_cbp_chroma = i_cbp_chroma_best; } } } if( h->mb.i_type == I_4x4 ) { pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning int nnz[3] = {0}; for( int idx = 0; idx < 16; idx++ ) { pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx], CHROMA_FORMAT ? h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx] : NULL, CHROMA_FORMAT ? 
h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx] : NULL}; i_best = COST_MAX64; const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx ); if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) for( int p = 0; p < plane_count; p++ ) /* emulate missing topright samples */ MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] ); for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; i_satd = rd_cost_i4x4( h, a->i_lambda2, idx, i_mode ); if( i_best > i_satd ) { a->i_predict4x4[idx] = i_mode; i_best = i_satd; for( int p = 0; p < plane_count; p++ ) { pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE ); pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE ); pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE ); pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE ); nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]]; } } } for( int p = 0; p < plane_count; p++ ) { MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0]; MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1]; MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2]; MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3]; h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p]; } h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx]; } } else if( h->mb.i_type == I_8x8 ) { ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap pixel4 pels_h[3][2] = {{0}}; pixel pels_v[3][7] = {{0}}; uint16_t nnz[3][2] = {{0}}; //shut up gcc for( int idx = 0; idx < 4; idx++ ) { int x = idx&1; int y = idx>>1; int s8 = X264_SCAN8_0 + 2*x + 16*y; pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE, CHROMA_FORMAT ? h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE : NULL, CHROMA_FORMAT ? h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE : NULL}; int cbp_luma_new = 0; int i_thresh = a->b_early_terminate ? 
a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX; i_best = COST_MAX64; const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx ); for( int p = 0; p < plane_count; p++ ) h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh ) continue; h->mb.i_cbp_luma = a->i_cbp_i8x8_luma; i_satd = rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge ); if( i_best > i_satd ) { a->i_predict8x8[idx] = i_mode; cbp_luma_new = h->mb.i_cbp_luma; i_best = i_satd; for( int p = 0; p < plane_count; p++ ) { pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ); pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ); if( !(idx&1) ) for( int j = 0; j < 7; j++ ) pels_v[p][j] = dst[p][7+j*FDEC_STRIDE]; nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ); nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ); } } } a->i_cbp_i8x8_luma = cbp_luma_new; for( int p = 0; p < plane_count; p++ ) { MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0]; MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1]; if( !(idx&1) ) for( int j = 0; j < 7; j++ ) dst[p][7+j*FDEC_STRIDE] = pels_v[p][j]; M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0]; M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1]; } x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); } } } #define LOAD_FENC(m, src, xoff, yoff) \ { \ (m)->p_cost_mv = a->p_cost_mv; \ (m)->i_stride[0] = h->mb.pic.i_stride[0]; \ (m)->i_stride[1] = h->mb.pic.i_stride[1]; \ (m)->i_stride[2] = h->mb.pic.i_stride[2]; \ (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \ if( CHROMA_FORMAT ) \ { \ (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \ (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \ } \ } #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \ { \ (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \ if( h->param.analyse.i_subpel_refine ) \ { \ (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \ } \ if( CHROMA444 ) \ { \ (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \ (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \ if( h->param.analyse.i_subpel_refine ) \ { \ (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \ (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \ (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \ (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \ (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \ (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \ } \ } \ else if( CHROMA_FORMAT ) \ (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \ if( h->param.analyse.i_me_method >= X264_ME_ESA ) \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = x264_weight_none; \ (m)->i_ref = ref; \ } #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \ (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = h->sh.weight[i_ref]; #define REF_COST(list, ref) \ (a->p_cost_ref[list][ref]) static void mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { x264_me_t m; int i_mvc; ALIGNED_ARRAY_8( 
int16_t, mvc,[8],[2] ); int i_halfpel_thresh = INT_MAX; int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL; /* 16x16 Search on all ref frame */ m.i_pixel = PIXEL_16x16; LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 ); a->l0.me16x16.cost = INT_MAX; for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ ) { m.i_ref_cost = REF_COST( 0, i_ref ); i_halfpel_thresh -= m.i_ref_cost; /* search with ref */ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 ); LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 ); x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); if( h->mb.ref_blind_dupe == i_ref ) { CP32( m.mv, a->l0.mvc[0][0] ); x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh ); } else { x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); } /* save mv for predicting neighbors */ CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv ); CP32( a->l0.mvc[i_ref][0], m.mv ); /* early termination * SSD threshold would probably be better than SATD */ if( i_ref == 0 && a->b_try_skip && m.cost-m.cost_mv < 300*a->i_lambda && abs(m.mv[0]-h->mb.cache.pskip_mv[0]) + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1 && x264_macroblock_probe_pskip( h ) ) { h->mb.i_type = P_SKIP; analyse_update_cache( h, a ); assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); return; } m.cost += m.i_ref_cost; i_halfpel_thresh += m.i_ref_cost; if( m.cost < a->l0.me16x16.cost ) h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) ); } x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); h->mb.i_type = P_L0; if( a->i_mbrd ) { mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 ); if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra ) { h->mb.i_partition = D_16x16; x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 ); if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) ) h->mb.i_type = P_SKIP; } } } static void mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a ) { x264_me_t m; pixel **p_fenc = h->mb.pic.p_fenc; int i_maxref = h->mb.pic.i_fref[0]-1; h->mb.i_partition = D_8x8; #define CHECK_NEIGHBOUR(i)\ {\ int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\ if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\ i_maxref = ref;\ } /* early termination: if 16x16 chose ref 0, then evaluate no refs older * than those used by the neighbors */ if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) && h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) ) { i_maxref = 0; CHECK_NEIGHBOUR( -8 - 1 ); CHECK_NEIGHBOUR( -8 + 0 ); CHECK_NEIGHBOUR( -8 + 2 ); CHECK_NEIGHBOUR( -8 + 4 ); CHECK_NEIGHBOUR( 0 - 1 ); CHECK_NEIGHBOUR( 2*8 - 1 ); } #undef CHECK_NEIGHBOUR for( int i_ref = 0; i_ref <= i_maxref; i_ref++ ) CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] ); for( int i = 0; i < 4; i++ ) { x264_me_t *l0m = &a->l0.me8x8[i]; int x8 = i&1; int y8 = i>>1; m.i_pixel = PIXEL_8x8; LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 ); l0m->cost = INT_MAX; for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; ) { m.i_ref_cost = REF_COST( 0, i_ref ); LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 ); LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 ); 
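            /* Cache the candidate reference for this 8x8 block before predicting its MV:
             * the MV predictor checks which neighbours use the same reference. */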
x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); if( h->mb.ref_blind_dupe == i_ref ) { CP32( m.mv, a->l0.mvc[0][i+1] ); x264_me_refine_qpel_refdupe( h, &m, NULL ); } else x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 ); m.cost += m.i_ref_cost; CP32( a->l0.mvc[i_ref][i+1], m.mv ); if( m.cost < l0m->cost ) h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe ) i_ref = h->mb.ref_blind_dupe; else i_ref++; } x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv ); x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref ); a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost ); /* If CABAC is on and we're not doing sub-8x8 analysis, the costs are effectively zero. */ if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) ) l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8]; } a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost + a->l0.me8x8[2].cost + a->l0.me8x8[3].cost; /* P_8x8 ref0 has no ref cost */ if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref | a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) ) a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4; M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101; } static void mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a ) { /* Duplicate refs are rarely useful in p8x8 due to the high cost of the * reference frame flags. Thus, if we're not doing mixedrefs, just * don't bother analysing the dupes. */ const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref; const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0; pixel **p_fenc = h->mb.pic.p_fenc; int i_mvc; int16_t (*mvc)[2] = a->l0.mvc[i_ref]; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; i_mvc = 1; CP32( mvc[0], a->l0.me16x16.mv ); for( int i = 0; i < 4; i++ ) { x264_me_t *m = &a->l0.me8x8[i]; int x8 = i&1; int y8 = i>>1; m->i_pixel = PIXEL_8x8; m->i_ref_cost = i_ref_cost; LOAD_FENC( m, p_fenc, 8*x8, 8*y8 ); LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 ); LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 ); x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp ); x264_me_search( h, m, mvc, i_mvc ); x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv ); CP32( mvc[i_mvc], m->mv ); i_mvc++; a->i_satd8x8[0][i] = m->cost - m->cost_mv; /* mb type cost */ m->cost += i_ref_cost; if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) ) m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8]; } a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost + a->l0.me8x8[2].cost + a->l0.me8x8[3].cost; /* theoretically this should include 4*ref_cost, * but 3 seems a better approximation of cabac. */ if( h->param.b_cabac ) a->l0.i_cost8x8 -= i_ref_cost; M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101; } static void mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { x264_me_t m; pixel **p_fenc = h->mb.pic.p_fenc; ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] ); /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_16x8; for( int i = 0; i < 2; i++ ) { x264_me_t *l0m = &a->l0.me16x8[i]; const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref ); const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref ); const int ref8[2] = { minref, maxref }; const int i_ref8s = ( ref8[0] == ref8[1] ) ? 
1 : 2; m.i_pixel = PIXEL_16x8; LOAD_FENC( &m, p_fenc, 0, 8*i ); l0m->cost = INT_MAX; for( int j = 0; j < i_ref8s; j++ ) { const int i_ref = ref8[j]; m.i_ref_cost = REF_COST( 0, i_ref ); /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */ CP32( mvc[0], a->l0.mvc[i_ref][0] ); CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] ); CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] ); LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i ); LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i ); x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref ); x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp ); /* We can only take this shortcut if the first search was performed on ref0. */ if( h->mb.ref_blind_dupe == i_ref && !ref8[0] ) { /* We can just leave the MV from the previous ref search. */ x264_me_refine_qpel_refdupe( h, &m, NULL ); } else x264_me_search( h, &m, mvc, 3 ); m.cost += m.i_ref_cost; if( m.cost < l0m->cost ) h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); } /* Early termination based on the current SATD score of partition[0] plus the estimated SATD score of partition[1] */ if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) ) { a->l0.i_cost16x8 = COST_MAX; return; } x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv ); x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref ); } a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost; } static void mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { x264_me_t m; pixel **p_fenc = h->mb.pic.p_fenc; ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] ); /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x16; for( int i = 0; i < 2; i++ ) { x264_me_t *l0m = &a->l0.me8x16[i]; const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref ); const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref ); const int ref8[2] = { minref, maxref }; const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2; m.i_pixel = PIXEL_8x16; LOAD_FENC( &m, p_fenc, 8*i, 0 ); l0m->cost = INT_MAX; for( int j = 0; j < i_ref8s; j++ ) { const int i_ref = ref8[j]; m.i_ref_cost = REF_COST( 0, i_ref ); CP32( mvc[0], a->l0.mvc[i_ref][0] ); CP32( mvc[1], a->l0.mvc[i_ref][i+1] ); CP32( mvc[2], a->l0.mvc[i_ref][i+3] ); LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 ); LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 ); x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); /* We can only take this shortcut if the first search was performed on ref0. */ if( h->mb.ref_blind_dupe == i_ref && !ref8[0] ) { /* We can just leave the MV from the previous ref search. 
*/ x264_me_refine_qpel_refdupe( h, &m, NULL ); } else x264_me_search( h, &m, mvc, 3 ); m.cost += m.i_ref_cost; if( m.cost < l0m->cost ) h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); } /* Early termination based on the current SATD score of partition[0] plus the estimated SATD score of partition[1] */ if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) ) { a->l0.i_cost8x16 = COST_MAX; return; } x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv ); x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref ); } a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost; } static ALWAYS_INLINE int mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size, int chroma ) { ALIGNED_ARRAY_32( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; int i_stride = h->mb.pic.i_stride[1]; int chroma_h_shift = chroma <= CHROMA_422; int chroma_v_shift = chroma == CHROMA_420; int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride; int i_ref = a->l0.me8x8[i8x8].i_ref; int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ if( chroma == CHROMA_444 ) \ { \ int mvx = (me).mv[0] + 4*2*x; \ int mvy = (me).mv[1] + 4*2*y; \ h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \ mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \ h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \ mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \ } \ else \ { \ int offset = x + (2>>chroma_v_shift)*16*y; \ int chroma_height = (2>>chroma_v_shift)*height; \ h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \ (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \ if( weight[1].weightfn ) \ weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \ if( weight[2].weightfn ) \ weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \ } if( size == PIXEL_4x4 ) { x264_me_t *m = a->l0.me4x4[i8x8]; CHROMA4x4MC( 2,2, m[0], 0,0 ); CHROMA4x4MC( 2,2, m[1], 2,0 ); CHROMA4x4MC( 2,2, m[2], 0,2 ); CHROMA4x4MC( 2,2, m[3], 2,2 ); } else if( size == PIXEL_8x4 ) { x264_me_t *m = a->l0.me8x4[i8x8]; CHROMA4x4MC( 4,2, m[0], 0,0 ); CHROMA4x4MC( 4,2, m[1], 0,2 ); } else { x264_me_t *m = a->l0.me4x8[i8x8]; CHROMA4x4MC( 2,4, m[0], 0,0 ); CHROMA4x4MC( 2,4, m[1], 2,0 ); } #undef CHROMA4x4MC int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE; int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? 
PIXEL_4x8 : PIXEL_4x4; return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 ) + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 ); } static int mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size ) { if( CHROMA_FORMAT == CHROMA_444 ) return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 ); else if( CHROMA_FORMAT == CHROMA_422 ) return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 ); else return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 ); } static void mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; pixel **p_fenc = h->mb.pic.p_fenc; const int i_ref = a->l0.me8x8[i8x8].i_ref; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; for( int i4x4 = 0; i4x4 < 4; i4x4++ ) { const int idx = 4*i8x8 + i4x4; const int x4 = block_idx_x[idx]; const int y4 = block_idx_y[idx]; const int i_mvc = (i4x4 == 0); x264_me_t *m = &a->l0.me4x4[i8x8][i4x4]; m->i_pixel = PIXEL_4x4; LOAD_FENC( m, p_fenc, 4*x4, 4*y4 ); LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 ); LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 ); x264_mb_predict_mv( h, 0, idx, 1, m->mvp ); x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc ); x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv ); } a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost + a->l0.me4x4[i8x8][1].cost + a->l0.me4x4[i8x8][2].cost + a->l0.me4x4[i8x8][3].cost + REF_COST( 0, i_ref ) + a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4]; if( h->mb.b_chroma_me && !CHROMA444 ) a->l0.i_cost4x4[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 ); } static void mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; pixel **p_fenc = h->mb.pic.p_fenc; const int i_ref = a->l0.me8x8[i8x8].i_ref; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; for( int i8x4 = 0; i8x4 < 2; i8x4++ ) { const int idx = 4*i8x8 + 2*i8x4; const int x4 = block_idx_x[idx]; const int y4 = block_idx_y[idx]; const int i_mvc = (i8x4 == 0); x264_me_t *m = &a->l0.me8x4[i8x8][i8x4]; m->i_pixel = PIXEL_8x4; LOAD_FENC( m, p_fenc, 4*x4, 4*y4 ); LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 ); LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 ); x264_mb_predict_mv( h, 0, idx, 2, m->mvp ); x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc ); x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv ); } a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost + REF_COST( 0, i_ref ) + a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4]; if( h->mb.b_chroma_me && !CHROMA444 ) a->l0.i_cost8x4[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 ); } static void mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; pixel **p_fenc = h->mb.pic.p_fenc; const int i_ref = a->l0.me8x8[i8x8].i_ref; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; for( int i4x8 = 0; i4x8 < 2; i4x8++ ) { const int idx = 4*i8x8 + i4x8; const int x4 = block_idx_x[idx]; const int y4 = block_idx_y[idx]; const int i_mvc = (i4x8 == 0); x264_me_t *m = &a->l0.me4x8[i8x8][i4x8]; m->i_pixel = PIXEL_4x8; LOAD_FENC( m, p_fenc, 4*x4, 4*y4 ); LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 ); LOAD_WPELS( m, 
h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 ); x264_mb_predict_mv( h, 0, idx, 1, m->mvp ); x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc ); x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv ); } a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost + REF_COST( 0, i_ref ) + a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8]; if( h->mb.b_chroma_me && !CHROMA444 ) a->l0.i_cost4x8[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 ); } static ALWAYS_INLINE int analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel ) { ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] ); ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] ); int i_chroma_cost = 0; int chromapix = h->luma2chroma_pixel[i_pixel]; #define COST_BI_CHROMA( m0, m1, width, height ) \ { \ if( CHROMA444 ) \ { \ h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \ m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \ m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \ m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \ m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \ } \ else \ { \ int v_shift = CHROMA_V_SHIFT; \ int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \ m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \ h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \ m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \ } \ h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \ + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ } if( i_pixel == PIXEL_16x16 ) COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 ) else if( i_pixel == PIXEL_16x8 ) COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 ) else if( i_pixel == PIXEL_8x16 ) COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 ) else COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 ) return i_chroma_cost; } static void mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) { /* Assumes that fdec still contains the results of * x264_mb_predict_mv_direct16x16 and x264_mb_mc */ pixel *p_fenc = h->mb.pic.p_fenc[0]; pixel *p_fdec = h->mb.pic.p_fdec[0]; a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT]; if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 ) { int chromapix = h->luma2chroma_pixel[PIXEL_8x8]; for( int i = 0; i < 4; i++ ) { const int x = (i&1)*8; const int y = (i>>1)*8; a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE ); if( h->mb.b_chroma_me ) { int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE; int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE; a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE, &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE ) + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], 
FENC_STRIDE, &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE ); } a->i_cost16x16direct += a->i_cost8x8direct[i]; /* mb type cost */ a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8]; } } else { a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE ); if( h->mb.b_chroma_me ) { int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); } } } static void mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) { ALIGNED_ARRAY_32( pixel, pix0,[16*16] ); ALIGNED_ARRAY_32( pixel, pix1,[16*16] ); pixel *src0, *src1; intptr_t stride0 = 16, stride1 = 16; int i_ref, i_mvc; ALIGNED_ARRAY_8( int16_t, mvc,[9],[2] ); int try_skip = a->b_try_skip; int list1_skipped = 0; int i_halfpel_thresh[2] = {INT_MAX, INT_MAX}; int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL, (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL}; x264_me_t m; m.i_pixel = PIXEL_16x16; LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 ); /* 16x16 Search on list 0 and list 1 */ a->l0.me16x16.cost = INT_MAX; a->l1.me16x16.cost = INT_MAX; for( int l = 1; l >= 0; ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; /* This loop is extremely munged in order to facilitate the following order of operations, * necessary for an efficient fast skip. * 1. Search list1 ref0. * 2. Search list0 ref0. * 3. Try skip. * 4. Search the rest of list0. * 5. Go back and finish list1. */ for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ ) { if( try_skip && l == 1 && i_ref > 0 ) { list1_skipped = 1; break; } m.i_ref_cost = REF_COST( l, i_ref ); /* search with ref */ LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 ); x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp ); x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc ); x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] ); /* add ref cost */ m.cost += m.i_ref_cost; if( m.cost < lX->me16x16.cost ) h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) ); /* save mv for predicting neighbors */ CP32( lX->mvc[i_ref][0], m.mv ); CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv ); /* Fast skip detection. 
*/ if( i_ref == 0 && try_skip ) { if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) + abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 ) { try_skip = 0; } else if( !l ) { /* We already tested skip */ h->mb.i_type = B_SKIP; analyse_update_cache( h, a ); return; } } } if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] ) break; if( list1_skipped && l == 0 ) l = 1; else l--; } /* get cost of BI mode */ h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) ); h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) ); int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref ); src0 = h->mc.get_ref( pix0, &stride0, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0], a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none ); src1 = h->mc.get_ref( pix1, &stride1, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0], a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none ); h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 ) + ref_costs + a->l0.bi16x16.cost_mv + a->l1.bi16x16.cost_mv; if( h->mb.b_chroma_me ) a->i_cost16x16bi += analyse_bi_chroma( h, a, 0, PIXEL_16x16 ); /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */ if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) ) { int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]] + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]]; int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]] + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]]; h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0], h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 ) + ref_costs + l0_mv_cost + l1_mv_cost; if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi ) { ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] ); if( CHROMA444 ) { h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1], h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE ); h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2], h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE ); } else { ALIGNED_ARRAY_64( pixel, pixuv, [2],[16*FENC_STRIDE] ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int v_shift = CHROMA_V_SHIFT; if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref ) { int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 ); } else h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 16>>v_shift ); if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref ) { int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( 
pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 ); } else h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 16>>v_shift ); h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE ) + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE ); } } if( cost00 < a->i_cost16x16bi ) { M32( a->l0.bi16x16.mv ) = 0; M32( a->l1.bi16x16.mv ) = 0; a->l0.bi16x16.cost_mv = l0_mv_cost; a->l1.bi16x16.cost_mv = l1_mv_cost; a->i_cost16x16bi = cost00; } } /* mb type cost */ a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI]; a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0]; a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1]; } static inline void mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i ) { int x = 2*(i&1); int y = i&2; switch( h->mb.i_sub_partition[i] ) { case D_L0_8x8: x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv ); break; case D_L0_8x4: x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv ); x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv ); break; case D_L0_4x8: x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv ); x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv ); break; case D_L0_4x4: x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv ); x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv ); x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv ); x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv ); break; default: x264_log( h, X264_LOG_ERROR, "internal error\n" ); break; } } static void mb_load_mv_direct8x8( x264_t *h, int idx ) { int x = 2*(idx&1); int y = idx&2; x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] ); x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] ); x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] ); x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] ); } #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \ if( x264_mb_partition_listX_table[0][part] ) \ { \ x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \ x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \ } \ else \ { \ x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \ x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \ if( b_mvd ) \ x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \ } \ if( x264_mb_partition_listX_table[1][part] ) \ { \ x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \ x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \ } \ else \ { \ x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \ x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \ if( b_mvd ) \ x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \ } static inline void mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd ) { int x = 2*(i&1); int y = i&2; if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 ) { mb_load_mv_direct8x8( h, i ); if( b_mvd ) { x264_macroblock_cache_mvd( h, x, y, 2, 
2, 0, 0 ); x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 ); x264_macroblock_cache_skip( h, x, y, 2, 2, 1 ); } } else { CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] ); } } static inline void mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd ) { CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] ); } static inline void mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd ) { CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] ); } #undef CACHE_MV_BI static void mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a ) { ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] ); int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1}; /* early termination: if 16x16 chose ref 0, then evaluate no refs older * than those used by the neighbors */ #define CHECK_NEIGHBOUR(i)\ {\ int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\ if( ref > i_maxref[l] )\ i_maxref[l] = ref;\ } for( int l = 0; l < 2; l++ ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 && h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 ) { i_maxref[l] = 0; CHECK_NEIGHBOUR( -8 - 1 ); CHECK_NEIGHBOUR( -8 + 0 ); CHECK_NEIGHBOUR( -8 + 2 ); CHECK_NEIGHBOUR( -8 + 4 ); CHECK_NEIGHBOUR( 0 - 1 ); CHECK_NEIGHBOUR( 2*8 - 1 ); } } /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; a->i_cost8x8bi = 0; for( int i = 0; i < 4; i++ ) { int x8 = i&1; int y8 = i>>1; int i_part_cost; int i_part_cost_bi; intptr_t stride[2] = {8,8}; pixel *src[2]; x264_me_t m; m.i_pixel = PIXEL_8x8; LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 ); for( int l = 0; l < 2; l++ ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; lX->me8x8[i].cost = INT_MAX; for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ ) { m.i_ref_cost = REF_COST( l, i_ref ); LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 ); x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref ); x264_mb_predict_mv( h, l, 4*i, 2, m.mvp ); x264_me_search( h, &m, lX->mvc[i_ref], i+1 ); m.cost += m.i_ref_cost; if( m.cost < lX->me8x8[i].cost ) { h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) ); a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost ); } /* save mv for predicting other partitions within this MB */ CP32( lX->mvc[i_ref][i+1], m.mv ); } } /* BI mode */ src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0], a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none ); src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0], a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none ); h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] ); a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 ); i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8]; if( h->mb.b_chroma_me ) { int i_chroma_cost = analyse_bi_chroma( h, a, i, PIXEL_8x8 ); i_part_cost_bi += i_chroma_cost; a->i_satd8x8[2][i] += i_chroma_cost; } a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8]; a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8]; i_part_cost = a->l0.me8x8[i].cost; h->mb.i_sub_partition[i] = D_L0_8x8; 
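/* Choose the cheapest mode for this 8x8 partition.  COPY2_IF_LT(x,y,a,b) is
 * roughly "if( y < x ) { x = y; a = b; }", so the L1, BI and DIRECT candidates
 * below each overwrite the running best cost and record their sub-partition
 * type only when strictly cheaper than the current choice (L0 by default).
 * The BI candidate's cost, computed above, combines the bipred SATD with both
 * lists' MV and ref costs plus the lambda-weighted sub_mb_type bits. */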
COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 ); COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 ); COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 ); a->i_cost8x8bi += i_part_cost; /* XXX Needed for x264_mb_predict_mv */ mb_cache_mv_b8x8( h, a, i, 0 ); } /* mb type cost */ a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8]; } static void mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a ) { pixel **p_fref[2] = { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref], h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] }; ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] ); /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; a->i_cost8x8bi = 0; for( int i = 0; i < 4; i++ ) { int x8 = i&1; int y8 = i>>1; int i_part_cost; int i_part_cost_bi = 0; intptr_t stride[2] = {8,8}; pixel *src[2]; for( int l = 0; l < 2; l++ ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; x264_me_t *m = &lX->me8x8[i]; m->i_pixel = PIXEL_8x8; LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 ); m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref ); m->i_ref = lX->me16x16.i_ref; LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 ); x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref ); x264_mb_predict_mv( h, l, 4*i, 2, m->mvp ); x264_me_search( h, m, &lX->me16x16.mv, 1 ); a->i_satd8x8[l][i] = m->cost - m->cost_mv; m->cost += m->i_ref_cost; x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv ); /* save mv for predicting other partitions within this MB */ CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv ); /* BI mode */ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0], m->mv[0], m->mv[1], 8, 8, x264_weight_none ); i_part_cost_bi += m->cost_mv + m->i_ref_cost; } h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] ); a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 ); i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8]; a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8]; a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8]; if( h->mb.b_chroma_me ) { int i_chroma_cost = analyse_bi_chroma( h, a, i, PIXEL_8x8 ); i_part_cost_bi += i_chroma_cost; a->i_satd8x8[2][i] += i_chroma_cost; } i_part_cost = a->l0.me8x8[i].cost; h->mb.i_sub_partition[i] = D_L0_8x8; COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 ); COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 ); COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 ); a->i_cost8x8bi += i_part_cost; /* XXX Needed for x264_mb_predict_mv */ mb_cache_mv_b8x8( h, a, i, 0 ); } /* mb type cost */ a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8]; } static void mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] ); ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] ); h->mb.i_partition = D_16x8; a->i_cost16x8bi = 0; for( int i = 0; i < 2; i++ ) { int i_part_cost; int i_part_cost_bi = 0; intptr_t stride[2] = {16,16}; pixel *src[2]; x264_me_t m; m.i_pixel = PIXEL_16x8; LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i ); for( int l = 0; l < 2; l++ ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref }; int i_ref8s = ( ref8[0] == ref8[1] ) ? 
1 : 2; lX->me16x8[i].cost = INT_MAX; for( int j = 0; j < i_ref8s; j++ ) { int i_ref = ref8[j]; m.i_ref_cost = REF_COST( l, i_ref ); LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i ); CP32( mvc[0], lX->mvc[i_ref][0] ); CP32( mvc[1], lX->mvc[i_ref][2*i+1] ); CP32( mvc[2], lX->mvc[i_ref][2*i+2] ); x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref ); x264_mb_predict_mv( h, l, 8*i, 4, m.mvp ); x264_me_search( h, &m, mvc, 3 ); m.cost += m.i_ref_cost; if( m.cost < lX->me16x8[i].cost ) h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) ); } } /* BI mode */ src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0], a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none ); src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0], a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none ); h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] ); i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 ) + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost + a->l1.me16x8[i].i_ref_cost; if( h->mb.b_chroma_me ) i_part_cost_bi += analyse_bi_chroma( h, a, i, PIXEL_16x8 ); i_part_cost = a->l0.me16x8[i].cost; a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */ if( a->l1.me16x8[i].cost < i_part_cost ) { i_part_cost = a->l1.me16x8[i].cost; a->i_mb_partition16x8[i] = D_L1_8x8; } if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost ) { i_part_cost = i_part_cost_bi; a->i_mb_partition16x8[i] = D_BI_8x8; } a->i_cost16x8bi += i_part_cost; /* Early termination based on the current SATD score of partition[0] plus the estimated SATD score of partition[1] */ if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) ) { a->i_cost16x8bi = COST_MAX; return; } mb_cache_mv_b16x8( h, a, i, 0 ); } /* mb type cost */ a->i_mb_type16x8 = B_L0_L0 + (a->i_mb_partition16x8[0]>>2) * 3 + (a->i_mb_partition16x8[1]>>2); a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8]; } static void mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] ); ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] ); h->mb.i_partition = D_8x16; a->i_cost8x16bi = 0; for( int i = 0; i < 2; i++ ) { int i_part_cost; int i_part_cost_bi = 0; intptr_t stride[2] = {8,8}; pixel *src[2]; x264_me_t m; m.i_pixel = PIXEL_8x16; LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 ); for( int l = 0; l < 2; l++ ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref }; int i_ref8s = ( ref8[0] == ref8[1] ) ? 
1 : 2; lX->me8x16[i].cost = INT_MAX; for( int j = 0; j < i_ref8s; j++ ) { int i_ref = ref8[j]; m.i_ref_cost = REF_COST( l, i_ref ); LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 ); CP32( mvc[0], lX->mvc[i_ref][0] ); CP32( mvc[1], lX->mvc[i_ref][i+1] ); CP32( mvc[2], lX->mvc[i_ref][i+3] ); x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref ); x264_mb_predict_mv( h, l, 4*i, 2, m.mvp ); x264_me_search( h, &m, mvc, 3 ); m.cost += m.i_ref_cost; if( m.cost < lX->me8x16[i].cost ) h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) ); } } /* BI mode */ src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0], a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none ); src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0], a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none ); h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] ); i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 ) + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost + a->l1.me8x16[i].i_ref_cost; if( h->mb.b_chroma_me ) i_part_cost_bi += analyse_bi_chroma( h, a, i, PIXEL_8x16 ); i_part_cost = a->l0.me8x16[i].cost; a->i_mb_partition8x16[i] = D_L0_8x8; if( a->l1.me8x16[i].cost < i_part_cost ) { i_part_cost = a->l1.me8x16[i].cost; a->i_mb_partition8x16[i] = D_L1_8x8; } if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost ) { i_part_cost = i_part_cost_bi; a->i_mb_partition8x16[i] = D_BI_8x8; } a->i_cost8x16bi += i_part_cost; /* Early termination based on the current SATD score of partition[0] plus the estimated SATD score of partition[1] */ if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) ) { a->i_cost8x16bi = COST_MAX; return; } mb_cache_mv_b8x16( h, a, i, 0 ); } /* mb type cost */ a->i_mb_type8x16 = B_L0_L0 + (a->i_mb_partition8x16[0]>>2) * 3 + (a->i_mb_partition8x16[1]>>2); a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16]; } static void mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) { int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX; h->mb.i_type = P_L0; if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) ) { h->mb.i_partition = D_16x16; analyse_update_cache( h, a ); a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 ); } if( a->l0.i_cost16x8 < thresh ) { h->mb.i_partition = D_16x8; analyse_update_cache( h, a ); a->l0.i_cost16x8 = rd_cost_mb( h, a->i_lambda2 ); } else a->l0.i_cost16x8 = COST_MAX; if( a->l0.i_cost8x16 < thresh ) { h->mb.i_partition = D_8x16; analyse_update_cache( h, a ); a->l0.i_cost8x16 = rd_cost_mb( h, a->i_lambda2 ); } else a->l0.i_cost8x16 = COST_MAX; if( a->l0.i_cost8x8 < thresh ) { h->mb.i_type = P_8x8; h->mb.i_partition = D_8x8; if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 ) { x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref ); x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection * for future blocks are those left over from previous RDO calls. 
*/ for( int i = 0; i < 4; i++ ) { int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost}; int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX; int subtype, btype = D_L0_8x8; uint64_t bcost = COST_MAX64; for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ ) { uint64_t cost; if( costs[subtype] > sub8x8_thresh ) continue; h->mb.i_sub_partition[i] = subtype; mb_cache_mv_p8x8( h, a, i ); if( subtype == btype ) continue; cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 ); COPY2_IF_LT( bcost, cost, btype, subtype ); } if( h->mb.i_sub_partition[i] != btype ) { h->mb.i_sub_partition[i] = btype; mb_cache_mv_p8x8( h, a, i ); } } } else analyse_update_cache( h, a ); a->l0.i_cost8x8 = rd_cost_mb( h, a->i_lambda2 ); } else a->l0.i_cost8x8 = COST_MAX; } static void mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter ) { int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX; if( a->b_direct_available && a->i_rd16x16direct == COST_MAX ) { h->mb.i_type = B_DIRECT; /* Assumes direct/skip MC is still in fdec */ /* Requires b-rdo to be done before intra analysis */ h->mb.b_skip_mc = 1; analyse_update_cache( h, a ); a->i_rd16x16direct = rd_cost_mb( h, a->i_lambda2 ); h->mb.b_skip_mc = 0; } //FIXME not all the update_cache calls are needed h->mb.i_partition = D_16x16; /* L0 */ if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX ) { h->mb.i_type = B_L0_L0; analyse_update_cache( h, a ); a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 ); } /* L1 */ if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX ) { h->mb.i_type = B_L1_L1; analyse_update_cache( h, a ); a->l1.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 ); } /* BI */ if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX ) { h->mb.i_type = B_BI_BI; analyse_update_cache( h, a ); a->i_rd16x16bi = rd_cost_mb( h, a->i_lambda2 ); } /* 8x8 */ if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX ) { h->mb.i_type = B_8x8; h->mb.i_partition = D_8x8; analyse_update_cache( h, a ); a->i_rd8x8bi = rd_cost_mb( h, a->i_lambda2 ); x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 ); } /* 16x8 */ if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX ) { h->mb.i_type = a->i_mb_type16x8; h->mb.i_partition = D_16x8; analyse_update_cache( h, a ); a->i_rd16x8bi = rd_cost_mb( h, a->i_lambda2 ); } /* 8x16 */ if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX ) { h->mb.i_type = a->i_mb_type8x16; h->mb.i_partition = D_8x16; analyse_update_cache( h, a ); a->i_rd8x16bi = rd_cost_mb( h, a->i_lambda2 ); } } static void refine_bidir( x264_t *h, x264_mb_analysis_t *a ) { int i_biweight; if( IS_INTRA(h->mb.i_type) ) return; switch( h->mb.i_partition ) { case D_16x16: if( h->mb.i_type == B_BI_BI ) { i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref]; x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight ); } break; case D_16x8: for( int i = 0; i < 2; i++ ) if( a->i_mb_partition16x8[i] == D_BI_8x8 ) { i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref]; x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight ); } break; case D_8x16: for( int i = 0; i < 2; i++ ) if( a->i_mb_partition8x16[i] == D_BI_8x8 ) { i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref]; x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight ); } break; case D_8x8: for( int i = 0; i < 4; 
i++ ) if( h->mb.i_sub_partition[i] == D_BI_8x8 ) { i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref]; x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight ); } break; } } static inline void mb_analyse_transform( x264_t *h ) { if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless ) { /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */ x264_mb_mc( h ); int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1; int i_cost8 = 0, i_cost4 = 0; /* Not all platforms have a merged SATD function */ if( h->pixf.sa8d_satd[PIXEL_16x16] ) { uint64_t cost = 0; for( int p = 0; p < plane_count; p++ ) { cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE ); } i_cost8 = (uint32_t)cost; i_cost4 = (uint32_t)(cost >> 32); } else { for( int p = 0; p < plane_count; p++ ) { i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE ); i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE ); } } h->mb.b_transform_8x8 = i_cost8 < i_cost4; h->mb.b_skip_mc = 1; } } static inline void mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd ) { if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode ) { uint32_t subpart_bak = M32( h->mb.i_sub_partition ); /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */ if( h->mb.i_type == P_8x8 ) M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101; else if( !x264_transform_allowed[h->mb.i_type] ) return; analyse_update_cache( h, a ); h->mb.b_transform_8x8 ^= 1; /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */ int i_rd8 = rd_cost_mb( h, a->i_lambda2 ); if( *i_rd >= i_rd8 ) { if( *i_rd > 0 ) *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd; *i_rd = i_rd8; } else { h->mb.b_transform_8x8 ^= 1; M32( h->mb.i_sub_partition ) = subpart_bak; } } } /* Rate-distortion optimal QP selection. * FIXME: More than half of the benefit of this function seems to be * in the way it improves the coding of chroma DC (by decimating or * finding a better way to code a single DC coefficient.) * There must be a more efficient way to get that portion of the benefit * without doing full QP-RD, but RD-decimation doesn't seem to do the * trick. */ static inline void mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a ) { int bcost, cost, failures, prevcost, origcost; int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp; int last_qp_tried = 0; origcost = bcost = rd_cost_mb( h, a->i_lambda2 ); int origcbp = h->mb.cbp[h->mb.i_mb_xy]; /* If CBP is already zero, don't raise the quantizer any higher. */ for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 ) { /* Without psy-RD, require monotonicity when moving quant away from previous * macroblock's quant; allow 1 failure when moving quant towards previous quant. * With psy-RD, allow 1 failure when moving quant away from previous quant, * allow 2 failures when moving quant towards previous quant. * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */ int threshold = (!!h->mb.i_psy_rd); /* Raise the threshold for failures if we're moving towards the last QP. 
*/ if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) || ( h->mb.i_last_qp > orig_qp && direction == 1 ) ) threshold++; h->mb.i_qp = orig_qp; failures = 0; prevcost = origcost; /* If the current QP results in an empty CBP, it's highly likely that lower QPs * (up to a point) will too. So, jump down to where the threshold will kick in * and check the QP there. If the CBP is still empty, skip the main loop. * If it isn't empty, we would have ended up having to check this QP anyways, * so as long as we store it for later lookup, we lose nothing. */ int already_checked_qp = -1; int already_checked_cost = COST_MAX; if( direction == -1 ) { if( !origcbp ) { h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) ); h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; already_checked_cost = rd_cost_mb( h, a->i_lambda2 ); if( !h->mb.cbp[h->mb.i_mb_xy] ) { /* If our empty-CBP block is lower QP than the last QP, * the last QP almost surely doesn't have a CBP either. */ if( h->mb.i_last_qp > h->mb.i_qp ) last_qp_tried = 1; break; } already_checked_qp = h->mb.i_qp; h->mb.i_qp = orig_qp; } } h->mb.i_qp += direction; while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) ) { if( h->mb.i_last_qp == h->mb.i_qp ) last_qp_tried = 1; if( h->mb.i_qp == already_checked_qp ) cost = already_checked_cost; else { h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; cost = rd_cost_mb( h, a->i_lambda2 ); COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp ); } /* We can't assume that the costs are monotonic over QPs. * Tie case-as-failure seems to give better results. */ if( cost < prevcost ) failures = 0; else failures++; prevcost = cost; if( failures > threshold ) break; if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] ) break; h->mb.i_qp += direction; } } /* Always try the last block's QP. */ if( !last_qp_tried ) { h->mb.i_qp = h->mb.i_last_qp; h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; cost = rd_cost_mb( h, a->i_lambda2 ); COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp ); } h->mb.i_qp = bqp; h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; /* Check transform again; decision from before may no longer be optimal. */ if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 && x264_mb_transform_8x8_allowed( h ) ) { h->mb.b_transform_8x8 ^= 1; cost = rd_cost_mb( h, a->i_lambda2 ); if( cost > bcost ) h->mb.b_transform_8x8 ^= 1; } } /***************************************************************************** * x264_macroblock_analyse: *****************************************************************************/ void x264_macroblock_analyse( x264_t *h ) { x264_mb_analysis_t analysis; int i_cost = COST_MAX; h->mb.i_qp = x264_ratecontrol_mb_qp( h ); /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 ) h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp; if( h->param.analyse.b_mb_info ) h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. 
*/ mb_analyse_init( h, &analysis, h->mb.i_qp ); /*--------------------------- Do the analysis ---------------------------*/ if( h->sh.i_type == SLICE_TYPE_I ) { intra_analysis: if( analysis.i_mbrd ) mb_init_fenc_cache( h, analysis.i_mbrd >= 2 ); mb_analyse_intra( h, &analysis, COST_MAX ); if( analysis.i_mbrd ) intra_rd( h, &analysis, COST_MAX ); i_cost = analysis.i_satd_i16x16; h->mb.i_type = I_16x16; COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 ); COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 ); if( analysis.i_satd_pcm < i_cost ) h->mb.i_type = I_PCM; else if( analysis.i_mbrd >= 2 ) intra_rd_refine( h, &analysis ); } else if( h->sh.i_type == SLICE_TYPE_P ) { int b_skip = 0; h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 ); analysis.b_try_skip = 0; if( analysis.b_force_intra ) { if( !h->param.analyse.b_psy ) { mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); goto intra_analysis; } } else { /* Special fast-skip logic using information from mb_info. */ if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) ) { if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred && h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp ) { h->mb.i_partition = D_16x16; /* Use the P-SKIP MV if we can... */ if( !M32(h->mb.cache.pskip_mv) ) { b_skip = 1; h->mb.i_type = P_SKIP; } /* Otherwise, just force a 16x16 block. */ else { h->mb.i_type = P_L0; analysis.l0.me16x16.i_ref = 0; M32( analysis.l0.me16x16.mv ) = 0; } goto skip_analysis; } /* Reset the information accordingly */ else if( h->param.analyse.b_mb_info_update ) h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT; } int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1]; /* If the current macroblock is off the frame, just skip it. 
*/ if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid ) b_skip = 1; /* Fast P_SKIP detection */ else if( h->param.analyse.b_fast_pskip ) { if( skip_invalid ) // FIXME don't need to check this if the reference frame is done {} else if( h->param.analyse.i_subpel_refine >= 3 ) analysis.b_try_skip = 1; else if( h->mb.i_mb_type_left[0] == P_SKIP || h->mb.i_mb_type_top == P_SKIP || h->mb.i_mb_type_topleft == P_SKIP || h->mb.i_mb_type_topright == P_SKIP ) b_skip = x264_macroblock_probe_pskip( h ); } } h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 ); if( b_skip ) { h->mb.i_type = P_SKIP; h->mb.i_partition = D_16x16; assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); skip_analysis: /* Set up MVs for future predictors */ for( int i = 0; i < h->mb.pic.i_fref[0]; i++ ) M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; } else { const unsigned int flags = h->param.analyse.inter; int i_type; int i_partition; int i_satd_inter, i_satd_intra; mb_analyse_load_costs( h, &analysis ); mb_analyse_inter_p16x16( h, &analysis ); if( h->mb.i_type == P_SKIP ) { for( int i = 1; i < h->mb.pic.i_fref[0]; i++ ) M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; return; } if( flags & X264_ANALYSE_PSUB16x16 ) { if( h->param.analyse.b_mixed_references ) mb_analyse_inter_p8x8_mixed_ref( h, &analysis ); else mb_analyse_inter_p8x8( h, &analysis ); } /* Select best inter mode */ i_type = P_L0; i_partition = D_16x16; i_cost = analysis.l0.me16x16.cost; if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate || analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) ) { i_type = P_8x8; i_partition = D_8x8; i_cost = analysis.l0.i_cost8x8; /* Do sub 8x8 */ if( flags & X264_ANALYSE_PSUB8x8 ) { for( int i = 0; i < 4; i++ ) { mb_analyse_inter_p4x4( h, &analysis, i ); int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv; if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 ) { int i_cost8x8 = analysis.l0.i_cost4x4[i]; h->mb.i_sub_partition[i] = D_L0_4x4; mb_analyse_inter_p8x4( h, &analysis, i ); COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i], h->mb.i_sub_partition[i], D_L0_8x4 ); mb_analyse_inter_p4x8( h, &analysis, i ); COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i], h->mb.i_sub_partition[i], D_L0_4x8 ); i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost; } mb_cache_mv_p8x8( h, &analysis, i ); } analysis.l0.i_cost8x8 = i_cost; } } /* Now do 16x8/8x16 */ int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv; if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate || analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) ) { int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1; analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost; mb_analyse_inter_p16x8( h, &analysis, i_cost ); COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 ); i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1; analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost; mb_analyse_inter_p8x16( h, &analysis, i_cost ); COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 ); 
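/* The 16x8/8x16 searches above estimate the second half's cost as the sum of
 * its two 8x8 SATD scores plus the average of their MV+ref costs; inside the
 * search, partition 0's cost plus that estimate is checked against the best
 * cost so far (relaxed by 25% when mbrd is on) and the whole mode is dropped
 * with COST_MAX if it can no longer win.  COPY3_IF_LT then switches to 16x8
 * or 8x16 only when the completed search actually came in cheaper. */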
} h->mb.i_partition = i_partition; /* refine qpel */ //FIXME mb_type costs? if( analysis.i_mbrd || !h->mb.i_subpel_refine ) { /* refine later */ } else if( i_partition == D_16x16 ) { x264_me_refine_qpel( h, &analysis.l0.me16x16 ); i_cost = analysis.l0.me16x16.cost; } else if( i_partition == D_16x8 ) { x264_me_refine_qpel( h, &analysis.l0.me16x8[0] ); x264_me_refine_qpel( h, &analysis.l0.me16x8[1] ); i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost; } else if( i_partition == D_8x16 ) { x264_me_refine_qpel( h, &analysis.l0.me8x16[0] ); x264_me_refine_qpel( h, &analysis.l0.me8x16[1] ); i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost; } else if( i_partition == D_8x8 ) { i_cost = 0; for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { switch( h->mb.i_sub_partition[i8x8] ) { case D_L0_8x8: x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] ); i_cost += analysis.l0.me8x8[i8x8].cost; break; case D_L0_8x4: x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] ); x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] ); i_cost += analysis.l0.me8x4[i8x8][0].cost + analysis.l0.me8x4[i8x8][1].cost; break; case D_L0_4x8: x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] ); x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] ); i_cost += analysis.l0.me4x8[i8x8][0].cost + analysis.l0.me4x8[i8x8][1].cost; break; case D_L0_4x4: x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] ); x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] ); x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] ); x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] ); i_cost += analysis.l0.me4x4[i8x8][0].cost + analysis.l0.me4x4[i8x8][1].cost + analysis.l0.me4x4[i8x8][2].cost + analysis.l0.me4x4[i8x8][3].cost; break; default: x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" ); break; } } } if( h->mb.b_chroma_me ) { if( CHROMA444 ) { mb_analyse_intra( h, &analysis, i_cost ); mb_analyse_intra_chroma( h, &analysis ); } else { mb_analyse_intra_chroma( h, &analysis ); mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma ); } analysis.i_satd_i16x16 += analysis.i_satd_chroma; analysis.i_satd_i8x8 += analysis.i_satd_chroma; analysis.i_satd_i4x4 += analysis.i_satd_chroma; } else mb_analyse_intra( h, &analysis, i_cost ); i_satd_inter = i_cost; i_satd_intra = X264_MIN3( analysis.i_satd_i16x16, analysis.i_satd_i8x8, analysis.i_satd_i4x4 ); if( analysis.i_mbrd ) { mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) ); i_type = P_L0; i_partition = D_16x16; i_cost = analysis.l0.i_rd16x16; COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 ); COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 ); COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 ); h->mb.i_type = i_type; h->mb.i_partition = i_partition; if( i_cost < COST_MAX ) mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost ); intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 ); } COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 ); COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 ); COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 ); COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM ); h->mb.i_type = i_type; if( analysis.b_force_intra && !IS_INTRA(i_type) ) { /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if * it was an inter block. */ analyse_update_cache( h, &analysis ); x264_macroblock_encode( h ); for( int p = 0; p < (CHROMA444 ? 
3 : 1); p++ ) h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 ); if( !CHROMA444 ) { int height = 16 >> CHROMA_V_SHIFT; h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height ); h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height ); } mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); goto intra_analysis; } if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM ) { if( IS_INTRA( h->mb.i_type ) ) { intra_rd_refine( h, &analysis ); } else if( i_partition == D_16x16 ) { x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref ); analysis.l0.me16x16.cost = i_cost; x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 ); } else if( i_partition == D_16x8 ) { M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101; x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref ); x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 ); } else if( i_partition == D_8x16 ) { M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101; x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref ); x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref ); x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 ); } else if( i_partition == D_8x8 ) { analyse_update_cache( h, &analysis ); for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 ) { x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 ); } else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 ) { x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 ); } else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 ) { x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 ); } else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 ) { x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 ); x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 ); } } } } } } else if( h->sh.i_type == SLICE_TYPE_B ) { int i_bskip_cost = COST_MAX; int b_skip = 0; if( analysis.i_mbrd ) mb_init_fenc_cache( h, analysis.i_mbrd >= 2 ); h->mb.i_type = B_SKIP; if( h->mb.b_direct_auto_write ) { /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */ for( int i = 0; i < 2; i++ ) { int b_changed = 1; h->sh.b_direct_spatial_mv_pred ^= 1; analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? 
&b_changed : NULL ); if( analysis.b_direct_available ) { if( b_changed ) { x264_mb_mc( h ); b_skip = x264_macroblock_probe_bskip( h ); } h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip; } else b_skip = 0; } } else analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL ); analysis.b_try_skip = 0; if( analysis.b_direct_available ) { if( !h->mb.b_direct_auto_write ) x264_mb_mc( h ); /* If the current macroblock is off the frame, just skip it. */ if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height ) b_skip = 1; else if( analysis.i_mbrd ) { i_bskip_cost = ssd_mb( h ); /* 6 = minimum cavlc cost of a non-skipped MB */ b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8); } else if( !h->mb.b_direct_auto_write ) { /* Conditioning the probe on neighboring block types * doesn't seem to help speed or quality. */ analysis.b_try_skip = x264_macroblock_probe_bskip( h ); if( h->param.analyse.i_subpel_refine < 3 ) b_skip = analysis.b_try_skip; } /* Set up MVs for future predictors */ if( b_skip ) { for( int i = 0; i < h->mb.pic.i_fref[0]; i++ ) M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; for( int i = 0; i < h->mb.pic.i_fref[1]; i++ ) M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0; } } if( !b_skip ) { const unsigned int flags = h->param.analyse.inter; int i_type; int i_partition; int i_satd_inter; h->mb.b_skip_mc = 0; h->mb.i_type = B_DIRECT; mb_analyse_load_costs( h, &analysis ); /* select best inter mode */ /* direct must be first */ if( analysis.b_direct_available ) mb_analyse_inter_direct( h, &analysis ); mb_analyse_inter_b16x16( h, &analysis ); if( h->mb.i_type == B_SKIP ) { for( int i = 1; i < h->mb.pic.i_fref[0]; i++ ) M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; for( int i = 1; i < h->mb.pic.i_fref[1]; i++ ) M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0; return; } i_type = B_L0_L0; i_partition = D_16x16; i_cost = analysis.l0.me16x16.cost; COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 ); COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI ); COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT ); if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 ) { mb_analyse_b_rd( h, &analysis, i_cost ); if( i_bskip_cost < analysis.i_rd16x16direct && i_bskip_cost < analysis.i_rd16x16bi && i_bskip_cost < analysis.l0.i_rd16x16 && i_bskip_cost < analysis.l1.i_rd16x16 ) { h->mb.i_type = B_SKIP; analyse_update_cache( h, &analysis ); return; } } if( flags & X264_ANALYSE_BSUB16x16 ) { if( h->param.analyse.b_mixed_references ) mb_analyse_inter_b8x8_mixed_ref( h, &analysis ); else mb_analyse_inter_b8x8( h, &analysis ); COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 ); /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */ int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0; int i_mb_type, i_partition16x8[2], i_partition8x16[2]; for( int i = 0; i < 2; i++ ) { int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost; int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost; // 16x8 i_best_cost = COST_MAX; i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1]; i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1]; i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1]; avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1; avg_l1_mv_ref_cost = ( 
analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1; COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 ); COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 ); COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 ); analysis.i_cost_est16x8[i] = i_best_cost; // 8x16 i_best_cost = COST_MAX; i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2]; i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2]; i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2]; avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1; avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1; COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 ); COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 ); COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 ); analysis.i_cost_est8x16[i] = i_best_cost; } i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2); analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type]; i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1]; i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2); analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type]; i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1]; /* We can gain a little speed by checking the mode with the lowest estimated cost first */ int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total; if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) ) { mb_analyse_inter_b16x8( h, &analysis, i_cost ); COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 ); } if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost ) { mb_analyse_inter_b8x16( h, &analysis, i_cost ); COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 ); } if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) ) { mb_analyse_inter_b16x8( h, &analysis, i_cost ); COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 ); } } if( analysis.i_mbrd || !h->mb.i_subpel_refine ) { /* refine later */ } /* refine qpel */ else if( i_partition == D_16x16 ) { analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0]; analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1]; if( i_type == B_L0_L0 ) { x264_me_refine_qpel( h, &analysis.l0.me16x16 ); i_cost = analysis.l0.me16x16.cost + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0]; } else if( i_type == B_L1_L1 ) { x264_me_refine_qpel( h, &analysis.l1.me16x16 ); i_cost = analysis.l1.me16x16.cost + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1]; } else if( i_type == B_BI_BI ) { x264_me_refine_qpel( h, &analysis.l0.bi16x16 ); x264_me_refine_qpel( h, &analysis.l1.bi16x16 ); } } else if( i_partition == D_16x8 ) { for( int i = 0; i < 2; i++ ) { if( 
analysis.i_mb_partition16x8[i] != D_L1_8x8 ) x264_me_refine_qpel( h, &analysis.l0.me16x8[i] ); if( analysis.i_mb_partition16x8[i] != D_L0_8x8 ) x264_me_refine_qpel( h, &analysis.l1.me16x8[i] ); } } else if( i_partition == D_8x16 ) { for( int i = 0; i < 2; i++ ) { if( analysis.i_mb_partition8x16[i] != D_L1_8x8 ) x264_me_refine_qpel( h, &analysis.l0.me8x16[i] ); if( analysis.i_mb_partition8x16[i] != D_L0_8x8 ) x264_me_refine_qpel( h, &analysis.l1.me8x16[i] ); } } else if( i_partition == D_8x8 ) { for( int i = 0; i < 4; i++ ) { x264_me_t *m; int i_part_cost_old; int i_type_cost; int i_part_type = h->mb.i_sub_partition[i]; int b_bidir = (i_part_type == D_BI_8x8); if( i_part_type == D_DIRECT_8x8 ) continue; if( x264_mb_partition_listX_table[0][i_part_type] ) { m = &analysis.l0.me8x8[i]; i_part_cost_old = m->cost; i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8]; m->cost -= i_type_cost; x264_me_refine_qpel( h, m ); if( !b_bidir ) analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old; } if( x264_mb_partition_listX_table[1][i_part_type] ) { m = &analysis.l1.me8x8[i]; i_part_cost_old = m->cost; i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8]; m->cost -= i_type_cost; x264_me_refine_qpel( h, m ); if( !b_bidir ) analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old; } /* TODO: update mvp? */ } } i_satd_inter = i_cost; if( analysis.i_mbrd ) { mb_analyse_b_rd( h, &analysis, i_satd_inter ); i_type = B_SKIP; i_cost = i_bskip_cost; i_partition = D_16x16; COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 ); COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 ); COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI ); COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT ); COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 ); COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 ); COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 ); h->mb.i_type = i_type; h->mb.i_partition = i_partition; } if( h->mb.b_chroma_me ) { if( CHROMA444 ) { mb_analyse_intra( h, &analysis, i_satd_inter ); mb_analyse_intra_chroma( h, &analysis ); } else { mb_analyse_intra_chroma( h, &analysis ); mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma ); } analysis.i_satd_i16x16 += analysis.i_satd_chroma; analysis.i_satd_i8x8 += analysis.i_satd_chroma; analysis.i_satd_i4x4 += analysis.i_satd_chroma; } else mb_analyse_intra( h, &analysis, i_satd_inter ); if( analysis.i_mbrd ) { mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost ); intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 ); } COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 ); COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 ); COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 ); COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM ); h->mb.i_type = i_type; h->mb.i_partition = i_partition; if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM ) intra_rd_refine( h, &analysis ); if( h->mb.i_subpel_refine >= 5 ) refine_bidir( h, &analysis ); if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP ) { int i_biweight; analyse_update_cache( h, &analysis ); if( i_partition == D_16x16 ) { if( i_type == B_L0_L0 ) { analysis.l0.me16x16.cost = i_cost; x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 ); } else if( i_type == B_L1_L1 ) { analysis.l1.me16x16.cost = i_cost; x264_me_refine_qpel_rd( h, 
&analysis.l1.me16x16, analysis.i_lambda2, 0, 1 ); } else if( i_type == B_BI_BI ) { i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 ); } } else if( i_partition == D_16x8 ) { for( int i = 0; i < 2; i++ ) { h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i]; if( analysis.i_mb_partition16x8[i] == D_L0_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 ); else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 ); else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 ) { i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 ); } } } else if( i_partition == D_8x16 ) { for( int i = 0; i < 2; i++ ) { h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i]; if( analysis.i_mb_partition8x16[i] == D_L0_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 ); else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 ); else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 ) { i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 ); } } } else if( i_partition == D_8x8 ) { for( int i = 0; i < 4; i++ ) { if( h->mb.i_sub_partition[i] == D_L0_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 ); else if( h->mb.i_sub_partition[i] == D_L1_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 ); else if( h->mb.i_sub_partition[i] == D_BI_8x8 ) { i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 ); } } } } } } analyse_update_cache( h, &analysis ); /* In rare cases we can end up qpel-RDing our way back to a larger partition size * without realizing it. Check for this and account for it if necessary. */ if( analysis.i_mbrd >= 2 ) { /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. 
*/ static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2}; int list = check_mv_lists[h->mb.i_type] - 1; if( list >= 0 && h->mb.i_partition != D_16x16 && M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) && h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] ) h->mb.i_partition = D_16x16; } if( !analysis.i_mbrd ) mb_analyse_transform( h ); if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) ) mb_analyse_qp_rd( h, &analysis ); h->mb.b_trellis = h->param.analyse.i_trellis; h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type )); if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 ) psy_trellis_init( h, 0 ); if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction ) h->mb.i_skip_intra = 0; } /*-------------------- Update MB from the analysis ----------------------*/ static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) { switch( h->mb.i_type ) { case I_4x4: for( int i = 0; i < 16; i++ ) h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i]; mb_analyse_intra_chroma( h, a ); break; case I_8x8: for( int i = 0; i < 4; i++ ) x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] ); mb_analyse_intra_chroma( h, a ); break; case I_16x16: h->mb.i_intra16x16_pred_mode = a->i_predict16x16; mb_analyse_intra_chroma( h, a ); break; case I_PCM: break; case P_L0: switch( h->mb.i_partition ) { case D_16x16: x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); break; case D_16x8: x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv ); x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv ); break; case D_8x16: x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref ); x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv ); x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv ); break; default: x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition ); break; } break; case P_8x8: x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref ); x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); for( int i = 0; i < 4; i++ ) mb_cache_mv_p8x8( h, a, i ); break; case P_SKIP: { h->mb.i_partition = D_16x16; x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv ); break; } case B_SKIP: case B_DIRECT: h->mb.i_partition = h->mb.cache.direct_partition; mb_load_mv_direct8x8( h, 0 ); mb_load_mv_direct8x8( h, 1 ); mb_load_mv_direct8x8( h, 2 ); mb_load_mv_direct8x8( h, 3 ); break; case B_8x8: /* optimize: cache might not need to be rewritten */ for( int i = 0; i < 4; i++ ) mb_cache_mv_b8x8( h, a, i, 1 ); break; default: /* the rest of the B types */ switch( h->mb.i_partition ) { case D_16x16: switch( h->mb.i_type ) { case B_L0_L0: x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); 
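/* Note (editorial, not in upstream x264): x264_macroblock_cache_ref/mv take
 * (x, y, width, height) in 4x4-block units, so a whole 16x16 partition is
 * (0,0,4,4) and the two halves of a 16x8 partition are (0,0,4,2) and (0,2,4,2).
 * For single-list modes such as B_L0_L0 the unused list is filled in just below
 * with ref -1 and zero mv/mvd, so neighbouring prediction sees a consistent
 * "no reference" state for that list. */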
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 ); x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 ); x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 ); break; case B_L1_L1: x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 ); x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 ); x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 ); x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv ); break; case B_BI_BI: x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv ); x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv ); break; } break; case D_16x8: mb_cache_mv_b16x8( h, a, 0, 1 ); mb_cache_mv_b16x8( h, a, 1, 1 ); break; case D_8x16: mb_cache_mv_b8x16( h, a, 0, 1 ); mb_cache_mv_b8x16( h, a, 1, 1 ); break; default: x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" ); break; } } #ifndef NDEBUG if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) ) { for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) { int completed; int ref = h->mb.cache.ref[l][x264_scan8[0]]; if( ref < 0 ) continue; completed = x264_frame_cond_wait( h->fref[l][ ref >> MB_INTERLACED ]->orig, -1 ); if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed ) { x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n"); x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type); x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref, h->mb.cache.mv[l][x264_scan8[15]][0], h->mb.cache.mv[l][x264_scan8[15]][1] ); x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]); x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y); x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed ); x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n"); mb_analyse_intra( h, a, COST_MAX ); h->mb.i_type = I_16x16; h->mb.i_intra16x16_pred_mode = a->i_predict16x16; mb_analyse_intra_chroma( h, a ); } } } #endif } #include "slicetype.c" x264-master/encoder/analyse.h000066400000000000000000000050471502133446700163110ustar00rootroot00000000000000/***************************************************************************** * analyse.h: macroblock analysis ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_ENCODER_ANALYSE_H #define X264_ENCODER_ANALYSE_H #define x264_analyse_init_costs x264_template(analyse_init_costs) int x264_analyse_init_costs( x264_t *h ); #define x264_analyse_free_costs x264_template(analyse_free_costs) void x264_analyse_free_costs( x264_t *h ); #define x264_analyse_weight_frame x264_template(analyse_weight_frame) void x264_analyse_weight_frame( x264_t *h, int end ); #define x264_macroblock_analyse x264_template(macroblock_analyse) void x264_macroblock_analyse( x264_t *h ); #define x264_slicetype_decide x264_template(slicetype_decide) void x264_slicetype_decide( x264_t *h ); #define x264_slicetype_analyse x264_template(slicetype_analyse) void x264_slicetype_analyse( x264_t *h, int intra_minigop ); #define x264_lookahead_init x264_template(lookahead_init) int x264_lookahead_init( x264_t *h, int i_slicetype_length ); #define x264_lookahead_is_empty x264_template(lookahead_is_empty) int x264_lookahead_is_empty( x264_t *h ); #define x264_lookahead_put_frame x264_template(lookahead_put_frame) void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame ); #define x264_lookahead_get_frames x264_template(lookahead_get_frames) void x264_lookahead_get_frames( x264_t *h ); #define x264_lookahead_delete x264_template(lookahead_delete) void x264_lookahead_delete( x264_t *h ); #endif x264-master/encoder/api.c000066400000000000000000000163761502133446700154300ustar00rootroot00000000000000/***************************************************************************** * api.c: bit depth independent interface ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Vittorio Giovara * Luca Barbato * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/base.h" /**************************************************************************** * global symbols ****************************************************************************/ const int x264_chroma_format = X264_CHROMA_FORMAT; x264_t *x264_8_encoder_open( x264_param_t *, void * ); void x264_8_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ); int x264_8_encoder_reconfig( x264_t *, x264_param_t * ); void x264_8_encoder_parameters( x264_t *, x264_param_t * ); int x264_8_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal ); int x264_8_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ); void x264_8_encoder_close( x264_t * ); int x264_8_encoder_delayed_frames( x264_t * ); int x264_8_encoder_maximum_delayed_frames( x264_t * ); void x264_8_encoder_intra_refresh( x264_t * ); int x264_8_encoder_invalidate_reference( x264_t *, int64_t pts ); x264_t *x264_10_encoder_open( x264_param_t *, void * ); void x264_10_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ); int x264_10_encoder_reconfig( x264_t *, x264_param_t * ); void x264_10_encoder_parameters( x264_t *, x264_param_t * ); int x264_10_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal ); int x264_10_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ); void x264_10_encoder_close( x264_t * ); int x264_10_encoder_delayed_frames( x264_t * ); int x264_10_encoder_maximum_delayed_frames( x264_t * ); void x264_10_encoder_intra_refresh( x264_t * ); int x264_10_encoder_invalidate_reference( x264_t *, int64_t pts ); typedef struct x264_api_t { /* Internal reference to x264_t data */ x264_t *x264; /* API entry points */ void (*nal_encode)( x264_t *h, uint8_t *dst, x264_nal_t *nal ); int (*encoder_reconfig)( x264_t *, x264_param_t * ); void (*encoder_parameters)( x264_t *, x264_param_t * ); int (*encoder_headers)( x264_t *, x264_nal_t **pp_nal, int *pi_nal ); int (*encoder_encode)( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ); void (*encoder_close)( x264_t * ); int (*encoder_delayed_frames)( x264_t * ); int (*encoder_maximum_delayed_frames)( x264_t * ); void (*encoder_intra_refresh)( x264_t * ); int (*encoder_invalidate_reference)( x264_t *, int64_t pts ); } x264_api_t; REALIGN_STACK x264_t *x264_encoder_open( x264_param_t *param ) { x264_api_t *api = calloc( 1, sizeof( x264_api_t ) ); if( !api ) return NULL; #if HAVE_BITDEPTH8 if( param->i_bitdepth == 8 ) { api->nal_encode = x264_8_nal_encode; api->encoder_reconfig = x264_8_encoder_reconfig; api->encoder_parameters = x264_8_encoder_parameters; api->encoder_headers = x264_8_encoder_headers; api->encoder_encode = x264_8_encoder_encode; api->encoder_close = x264_8_encoder_close; api->encoder_delayed_frames = x264_8_encoder_delayed_frames; api->encoder_maximum_delayed_frames = x264_8_encoder_maximum_delayed_frames; api->encoder_intra_refresh = x264_8_encoder_intra_refresh; api->encoder_invalidate_reference = x264_8_encoder_invalidate_reference; api->x264 = x264_8_encoder_open( param, api ); } else #endif #if HAVE_BITDEPTH10 if( param->i_bitdepth == 10 ) { api->nal_encode = x264_10_nal_encode; api->encoder_reconfig = x264_10_encoder_reconfig; api->encoder_parameters = x264_10_encoder_parameters; api->encoder_headers = x264_10_encoder_headers; api->encoder_encode = x264_10_encoder_encode; api->encoder_close = 
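/* Illustrative sketch (editorial, not part of x264): a minimal caller-side view
 * of the bit-depth dispatch performed by this function.  It only uses public
 * entry points declared in x264.h; the helper name open_encoder_sketch and the
 * parameter choices are invented for the example.
 *
 *   #include "x264.h"
 *
 *   static x264_t *open_encoder_sketch( int width, int height, int bit_depth )
 *   {
 *       x264_param_t param;
 *       if( x264_param_default_preset( &param, "medium", NULL ) < 0 )
 *           return NULL;
 *       param.i_bitdepth = bit_depth;     // 8 or 10: selects the entry points above
 *       param.i_width    = width;
 *       param.i_height   = height;
 *       param.i_csp      = X264_CSP_I420; // high-depth input additionally needs
 *                                         // X264_CSP_HIGH_DEPTH on the pictures
 *       return x264_encoder_open( &param );  // NULL if this bit depth wasn't compiled in
 *   }
 */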
x264_10_encoder_close; api->encoder_delayed_frames = x264_10_encoder_delayed_frames; api->encoder_maximum_delayed_frames = x264_10_encoder_maximum_delayed_frames; api->encoder_intra_refresh = x264_10_encoder_intra_refresh; api->encoder_invalidate_reference = x264_10_encoder_invalidate_reference; api->x264 = x264_10_encoder_open( param, api ); } else #endif x264_log_internal( X264_LOG_ERROR, "not compiled with %d bit depth support\n", param->i_bitdepth ); if( !api->x264 ) { free( api ); return NULL; } /* x264_t is opaque */ return (x264_t *)api; } REALIGN_STACK void x264_encoder_close( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; api->encoder_close( api->x264 ); free( api ); } REALIGN_STACK void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ) { x264_api_t *api = (x264_api_t *)h; api->nal_encode( api->x264, dst, nal ); } REALIGN_STACK int x264_encoder_reconfig( x264_t *h, x264_param_t *param) { x264_api_t *api = (x264_api_t *)h; return api->encoder_reconfig( api->x264, param ); } REALIGN_STACK void x264_encoder_parameters( x264_t *h, x264_param_t *param ) { x264_api_t *api = (x264_api_t *)h; api->encoder_parameters( api->x264, param ); } REALIGN_STACK int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal ) { x264_api_t *api = (x264_api_t *)h; return api->encoder_headers( api->x264, pp_nal, pi_nal ); } REALIGN_STACK int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ) { x264_api_t *api = (x264_api_t *)h; return api->encoder_encode( api->x264, pp_nal, pi_nal, pic_in, pic_out ); } REALIGN_STACK int x264_encoder_delayed_frames( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; return api->encoder_delayed_frames( api->x264 ); } REALIGN_STACK int x264_encoder_maximum_delayed_frames( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; return api->encoder_maximum_delayed_frames( api->x264 ); } REALIGN_STACK void x264_encoder_intra_refresh( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; api->encoder_intra_refresh( api->x264 ); } REALIGN_STACK int x264_encoder_invalidate_reference( x264_t *h, int64_t pts ) { x264_api_t *api = (x264_api_t *)h; return api->encoder_invalidate_reference( api->x264, pts ); } x264-master/encoder/cabac.c000066400000000000000000001317201502133446700156770ustar00rootroot00000000000000/***************************************************************************** * cabac.c: cabac bitstream writing ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macroblock.h" #ifndef RDO_SKIP_BS #define RDO_SKIP_BS 0 #endif static inline void cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type, int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 ) { if( i_mb_type == I_4x4 || i_mb_type == I_8x8 ) { x264_cabac_encode_decision_noup( cb, ctx0, 0 ); } #if !RDO_SKIP_BS else if( i_mb_type == I_PCM ) { x264_cabac_encode_decision_noup( cb, ctx0, 1 ); x264_cabac_encode_flush( h, cb ); } #endif else { int i_pred = x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode]; x264_cabac_encode_decision_noup( cb, ctx0, 1 ); x264_cabac_encode_terminal( cb ); x264_cabac_encode_decision_noup( cb, ctx1, !!h->mb.i_cbp_luma ); if( h->mb.i_cbp_chroma == 0 ) x264_cabac_encode_decision_noup( cb, ctx2, 0 ); else { x264_cabac_encode_decision( cb, ctx2, 1 ); x264_cabac_encode_decision_noup( cb, ctx3, h->mb.i_cbp_chroma>>1 ); } x264_cabac_encode_decision( cb, ctx4, i_pred>>1 ); x264_cabac_encode_decision_noup( cb, ctx5, i_pred&1 ); } } #if !RDO_SKIP_BS static void cabac_field_decoding_flag( x264_t *h, x264_cabac_t *cb ) { int ctx = 0; ctx += h->mb.field_decoding_flag & !!h->mb.i_mb_x; ctx += (h->mb.i_mb_top_mbpair_xy >= 0 && h->mb.slice_table[h->mb.i_mb_top_mbpair_xy] == h->sh.i_first_mb && h->mb.field[h->mb.i_mb_top_mbpair_xy]); x264_cabac_encode_decision_noup( cb, 70 + ctx, MB_INTERLACED ); h->mb.field_decoding_flag = MB_INTERLACED; } #endif static void cabac_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode ) { if( i_pred == i_mode ) x264_cabac_encode_decision( cb, 68, 1 ); else { x264_cabac_encode_decision( cb, 68, 0 ); if( i_mode > i_pred ) i_mode--; x264_cabac_encode_decision( cb, 69, (i_mode )&0x01 ); x264_cabac_encode_decision( cb, 69, (i_mode >> 1)&0x01 ); x264_cabac_encode_decision( cb, 69, (i_mode >> 2) ); } } static void cabac_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb ) { int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; int ctx = 0; /* No need to test for I4x4 or I_16x16 as cache_save handle that */ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 ) ctx++; if( (h->mb.i_neighbour & MB_TOP) && h->mb.chroma_pred_mode[h->mb.i_mb_top_xy] != 0 ) ctx++; x264_cabac_encode_decision_noup( cb, 64 + ctx, i_mode > 0 ); if( i_mode > 0 ) { x264_cabac_encode_decision( cb, 64 + 3, i_mode > 1 ); if( i_mode > 1 ) x264_cabac_encode_decision_noup( cb, 64 + 3, i_mode > 2 ); } } static void cabac_cbp_luma( x264_t *h, x264_cabac_t *cb ) { int cbp = h->mb.i_cbp_luma; int cbp_l = h->mb.cache.i_cbp_left; int cbp_t = h->mb.cache.i_cbp_top; x264_cabac_encode_decision ( cb, 76 - ((cbp_l >> 1) & 1) - ((cbp_t >> 1) & 2), (cbp >> 0) & 1 ); x264_cabac_encode_decision ( cb, 76 - ((cbp >> 0) & 1) - ((cbp_t >> 2) & 2), (cbp >> 1) & 1 ); x264_cabac_encode_decision ( cb, 76 - ((cbp_l >> 3) & 1) - ((cbp << 1) & 2), (cbp >> 2) & 1 ); x264_cabac_encode_decision_noup( cb, 76 - ((cbp >> 2) & 1) - ((cbp >> 0) & 2), (cbp >> 3) & 1 ); } static void cabac_cbp_chroma( x264_t *h, x264_cabac_t *cb ) { int cbp_a = h->mb.cache.i_cbp_left & 0x30; int cbp_b = h->mb.cache.i_cbp_top & 0x30; int ctx = 0; if( cbp_a && h->mb.cache.i_cbp_left != -1 ) ctx++; if( cbp_b && h->mb.cache.i_cbp_top != -1 ) ctx+=2; if( h->mb.i_cbp_chroma == 0 ) x264_cabac_encode_decision_noup( cb, 77 + ctx, 0 ); else { x264_cabac_encode_decision_noup( cb, 77 + ctx, 1 ); ctx = 4; if( cbp_a == 0x20 ) ctx++; if( cbp_b == 0x20 ) ctx += 2; 
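/* Note (editorial, not in upstream x264): the bin coded just below signals
 * i_cbp_chroma == 2, i.e. chroma AC coefficients are present; its context
 * (4..7 on top of base 77) depends on whether the left and top neighbours also
 * coded chroma AC, which is what the cbp == 0x20 tests above check. */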
x264_cabac_encode_decision_noup( cb, 77 + ctx, h->mb.i_cbp_chroma >> 1 ); } } static void cabac_qp_delta( x264_t *h, x264_cabac_t *cb ) { int i_dqp = h->mb.i_qp - h->mb.i_last_qp; int ctx; /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely * flat background area. Don't do this if it would raise the quantizer, since that could * cause unexpected deblocking artifacts. */ if( h->mb.i_type == I_16x16 && !h->mb.cbp[h->mb.i_mb_xy] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; #endif i_dqp = 0; } ctx = h->mb.i_last_dqp && (h->mb.type[h->mb.i_mb_prev_xy] == I_16x16 || (h->mb.cbp[h->mb.i_mb_prev_xy]&0x3f)); if( i_dqp != 0 ) { /* Faster than (i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp-1)). * If you so much as sneeze on these lines, gcc will compile this suboptimally. */ i_dqp *= 2; int val = 1 - i_dqp; if( val < 0 ) val = i_dqp; val--; /* dqp is interpreted modulo (QP_MAX_SPEC+1) */ if( val >= QP_MAX_SPEC && val != QP_MAX_SPEC+1 ) val = 2*QP_MAX_SPEC+1 - val; do { x264_cabac_encode_decision( cb, 60 + ctx, 1 ); ctx = 2+(ctx>>1); } while( --val ); } x264_cabac_encode_decision_noup( cb, 60 + ctx, 0 ); } #if !RDO_SKIP_BS void x264_cabac_mb_skip( x264_t *h, int b_skip ) { int ctx = h->mb.cache.i_neighbour_skip + 11; if( h->sh.i_type != SLICE_TYPE_P ) ctx += 13; x264_cabac_encode_decision( &h->cabac, ctx, b_skip ); } #endif static inline void cabac_subpartition_p( x264_cabac_t *cb, int i_sub ) { if( i_sub == D_L0_8x8 ) { x264_cabac_encode_decision( cb, 21, 1 ); return; } x264_cabac_encode_decision( cb, 21, 0 ); if( i_sub == D_L0_8x4 ) x264_cabac_encode_decision( cb, 22, 0 ); else { x264_cabac_encode_decision( cb, 22, 1 ); x264_cabac_encode_decision( cb, 23, i_sub == D_L0_4x8 ); } } static ALWAYS_INLINE void cabac_subpartition_b( x264_cabac_t *cb, int i_sub ) { if( i_sub == D_DIRECT_8x8 ) { x264_cabac_encode_decision( cb, 36, 0 ); return; } x264_cabac_encode_decision( cb, 36, 1 ); if( i_sub == D_BI_8x8 ) { x264_cabac_encode_decision( cb, 37, 1 ); x264_cabac_encode_decision( cb, 38, 0 ); x264_cabac_encode_decision( cb, 39, 0 ); x264_cabac_encode_decision( cb, 39, 0 ); return; } x264_cabac_encode_decision( cb, 37, 0 ); x264_cabac_encode_decision( cb, 39, i_sub == D_L1_8x8 ); } static ALWAYS_INLINE void cabac_transform_size( x264_t *h, x264_cabac_t *cb ) { int ctx = 399 + h->mb.cache.i_neighbour_transform_size; x264_cabac_encode_decision_noup( cb, ctx, h->mb.b_transform_8x8 ); } static ALWAYS_INLINE void cabac_ref_internal( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int bframe ) { const int i8 = x264_scan8[idx]; const int i_refa = h->mb.cache.ref[i_list][i8 - 1]; const int i_refb = h->mb.cache.ref[i_list][i8 - 8]; int ctx = 0; if( i_refa > 0 && (!bframe || !h->mb.cache.skip[i8 - 1]) ) ctx++; if( i_refb > 0 && (!bframe || !h->mb.cache.skip[i8 - 8]) ) ctx += 2; for( int i_ref = h->mb.cache.ref[i_list][i8]; i_ref > 0; i_ref-- ) { x264_cabac_encode_decision( cb, 54 + ctx, 1 ); ctx = (ctx>>2)+4; } x264_cabac_encode_decision( cb, 54 + ctx, 0 ); } static NOINLINE void cabac_ref_p( x264_t *h, x264_cabac_t *cb, int idx ) { cabac_ref_internal( h, cb, 0, idx, 0 ); } static NOINLINE void cabac_ref_b( x264_t *h, x264_cabac_t *cb, int i_list, int idx ) { cabac_ref_internal( h, cb, i_list, idx, 1 ); } static ALWAYS_INLINE int cabac_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd, int ctx ) { int ctxbase = l ? 
47 : 40; if( mvd == 0 ) { x264_cabac_encode_decision( cb, ctxbase + ctx, 0 ); return 0; } int i_abs = abs( mvd ); x264_cabac_encode_decision( cb, ctxbase + ctx, 1 ); #if RDO_SKIP_BS if( i_abs <= 3 ) { for( int i = 1; i < i_abs; i++ ) x264_cabac_encode_decision( cb, ctxbase + i + 2, 1 ); x264_cabac_encode_decision( cb, ctxbase + i_abs + 2, 0 ); x264_cabac_encode_bypass( cb, mvd >> 31 ); } else { x264_cabac_encode_decision( cb, ctxbase + 3, 1 ); x264_cabac_encode_decision( cb, ctxbase + 4, 1 ); x264_cabac_encode_decision( cb, ctxbase + 5, 1 ); if( i_abs < 9 ) { cb->f8_bits_encoded += x264_cabac_size_unary[i_abs - 3][cb->state[ctxbase+6]]; cb->state[ctxbase+6] = x264_cabac_transition_unary[i_abs - 3][cb->state[ctxbase+6]]; } else { cb->f8_bits_encoded += cabac_size_5ones[cb->state[ctxbase+6]]; cb->state[ctxbase+6] = cabac_transition_5ones[cb->state[ctxbase+6]]; x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 ); } } #else static const uint8_t ctxes[8] = { 3,4,5,6,6,6,6,6 }; if( i_abs < 9 ) { for( int i = 1; i < i_abs; i++ ) x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 ); x264_cabac_encode_decision( cb, ctxbase + ctxes[i_abs-1], 0 ); } else { for( int i = 1; i < 9; i++ ) x264_cabac_encode_decision( cb, ctxbase + ctxes[i-1], 1 ); x264_cabac_encode_ue_bypass( cb, 3, i_abs - 9 ); } x264_cabac_encode_bypass( cb, mvd >> 31 ); #endif /* Since we don't need to keep track of MVDs larger than 66, just cap the value. * This lets us store MVDs as 8-bit values instead of 16-bit. */ return X264_MIN( i_abs, 66 ); } static NOINLINE uint16_t cabac_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width ) { ALIGNED_4( int16_t mvp[2] ); int mdx, mdy; /* Calculate mvd */ x264_mb_predict_mv( h, i_list, idx, width, mvp ); mdx = h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0]; mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1]; uint16_t amvd = x264_cabac_mvd_sum(h->mb.cache.mvd[i_list][x264_scan8[idx] - 1], h->mb.cache.mvd[i_list][x264_scan8[idx] - 8]); /* encode */ mdx = cabac_mvd_cpn( h, cb, i_list, idx, 0, mdx, amvd&0xFF ); mdy = cabac_mvd_cpn( h, cb, i_list, idx, 1, mdy, amvd>>8 ); return pack8to16(mdx,mdy); } #define cabac_mvd(h,cb,i_list,idx,width,height)\ do\ {\ uint16_t mvd = cabac_mvd(h,cb,i_list,idx,width);\ x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\ } while( 0 ) static inline void cabac_8x8_mvd( x264_t *h, x264_cabac_t *cb, int i ) { switch( h->mb.i_sub_partition[i] ) { case D_L0_8x8: cabac_mvd( h, cb, 0, 4*i, 2, 2 ); break; case D_L0_8x4: cabac_mvd( h, cb, 0, 4*i+0, 2, 1 ); cabac_mvd( h, cb, 0, 4*i+2, 2, 1 ); break; case D_L0_4x8: cabac_mvd( h, cb, 0, 4*i+0, 1, 2 ); cabac_mvd( h, cb, 0, 4*i+1, 1, 2 ); break; case D_L0_4x4: cabac_mvd( h, cb, 0, 4*i+0, 1, 1 ); cabac_mvd( h, cb, 0, 4*i+1, 1, 1 ); cabac_mvd( h, cb, 0, 4*i+2, 1, 1 ); cabac_mvd( h, cb, 0, 4*i+3, 1, 1 ); break; default: assert(0); } } static ALWAYS_INLINE void cabac_mb_header_i( x264_t *h, x264_cabac_t *cb, int i_mb_type, int slice_type, int chroma ) { if( slice_type == SLICE_TYPE_I ) { int ctx = 0; if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != I_4x4 ) ctx++; if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != I_4x4 ) ctx++; cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 ); } else if( slice_type == SLICE_TYPE_P ) { /* prefix */ x264_cabac_encode_decision_noup( cb, 14, 1 ); /* suffix */ cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 ); } else if( slice_type == SLICE_TYPE_B ) { /* prefix */ 
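/* Note (editorial, not in upstream x264): combined with the non-direct bin that
 * cabac_mb_header_b() has already written, the five bins below (1,1,1,0,1) form
 * the mb_type escape prefix that selects an intra macroblock in a B slice; the
 * intra type itself then follows as the suffix via cabac_mb_type_intra(). */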
x264_cabac_encode_decision_noup( cb, 27+3, 1 ); x264_cabac_encode_decision_noup( cb, 27+4, 1 ); x264_cabac_encode_decision( cb, 27+5, 1 ); x264_cabac_encode_decision( cb, 27+5, 0 ); x264_cabac_encode_decision( cb, 27+5, 1 ); /* suffix */ cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 ); } if( i_mb_type == I_PCM ) return; if( i_mb_type != I_16x16 ) { if( h->pps->b_transform_8x8_mode ) cabac_transform_size( h, cb ); int di = h->mb.b_transform_8x8 ? 4 : 1; for( int i = 0; i < 16; i += di ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, i ); const int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] ); cabac_intra4x4_pred_mode( cb, i_pred, i_mode ); } } if( chroma ) cabac_intra_chroma_pred_mode( h, cb ); } static ALWAYS_INLINE void cabac_mb_header_p( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma ) { if( i_mb_type == P_L0 ) { x264_cabac_encode_decision_noup( cb, 14, 0 ); if( h->mb.i_partition == D_16x16 ) { x264_cabac_encode_decision_noup( cb, 15, 0 ); x264_cabac_encode_decision_noup( cb, 16, 0 ); if( h->mb.pic.i_fref[0] > 1 ) cabac_ref_p( h, cb, 0 ); cabac_mvd( h, cb, 0, 0, 4, 4 ); } else if( h->mb.i_partition == D_16x8 ) { x264_cabac_encode_decision_noup( cb, 15, 1 ); x264_cabac_encode_decision_noup( cb, 17, 1 ); if( h->mb.pic.i_fref[0] > 1 ) { cabac_ref_p( h, cb, 0 ); cabac_ref_p( h, cb, 8 ); } cabac_mvd( h, cb, 0, 0, 4, 2 ); cabac_mvd( h, cb, 0, 8, 4, 2 ); } else //if( h->mb.i_partition == D_8x16 ) { x264_cabac_encode_decision_noup( cb, 15, 1 ); x264_cabac_encode_decision_noup( cb, 17, 0 ); if( h->mb.pic.i_fref[0] > 1 ) { cabac_ref_p( h, cb, 0 ); cabac_ref_p( h, cb, 4 ); } cabac_mvd( h, cb, 0, 0, 2, 4 ); cabac_mvd( h, cb, 0, 4, 2, 4 ); } } else if( i_mb_type == P_8x8 ) { x264_cabac_encode_decision_noup( cb, 14, 0 ); x264_cabac_encode_decision_noup( cb, 15, 0 ); x264_cabac_encode_decision_noup( cb, 16, 1 ); /* sub mb type */ for( int i = 0; i < 4; i++ ) cabac_subpartition_p( cb, h->mb.i_sub_partition[i] ); /* ref 0 */ if( h->mb.pic.i_fref[0] > 1 ) { cabac_ref_p( h, cb, 0 ); cabac_ref_p( h, cb, 4 ); cabac_ref_p( h, cb, 8 ); cabac_ref_p( h, cb, 12 ); } for( int i = 0; i < 4; i++ ) cabac_8x8_mvd( h, cb, i ); } else /* intra */ cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_P, chroma ); } static ALWAYS_INLINE void cabac_mb_header_b( x264_t *h, x264_cabac_t *cb, int i_mb_type, int chroma ) { int ctx = 0; if( (h->mb.i_neighbour & MB_LEFT) && h->mb.i_mb_type_left[0] != B_SKIP && h->mb.i_mb_type_left[0] != B_DIRECT ) ctx++; if( (h->mb.i_neighbour & MB_TOP) && h->mb.i_mb_type_top != B_SKIP && h->mb.i_mb_type_top != B_DIRECT ) ctx++; if( i_mb_type == B_DIRECT ) { x264_cabac_encode_decision_noup( cb, 27+ctx, 0 ); return; } x264_cabac_encode_decision_noup( cb, 27+ctx, 1 ); if( i_mb_type == B_8x8 ) { x264_cabac_encode_decision_noup( cb, 27+3, 1 ); x264_cabac_encode_decision_noup( cb, 27+4, 1 ); x264_cabac_encode_decision( cb, 27+5, 1 ); x264_cabac_encode_decision( cb, 27+5, 1 ); x264_cabac_encode_decision_noup( cb, 27+5, 1 ); /* sub mb type */ for( int i = 0; i < 4; i++ ) cabac_subpartition_b( cb, h->mb.i_sub_partition[i] ); /* ref */ if( h->mb.pic.i_fref[0] > 1 ) for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) cabac_ref_b( h, cb, 0, 4*i ); if( h->mb.pic.i_fref[1] > 1 ) for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) cabac_ref_b( h, cb, 1, 4*i ); for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[0][ 
h->mb.i_sub_partition[i] ] ) cabac_mvd( h, cb, 0, 4*i, 2, 2 ); for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) cabac_mvd( h, cb, 1, 4*i, 2, 2 ); } else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI ) { /* All B modes */ static const uint8_t i_mb_bits[9*3] = { 0x31, 0x29, 0x4, /* L0 L0 */ 0x35, 0x2d, 0, /* L0 L1 */ 0x43, 0x63, 0, /* L0 BI */ 0x3d, 0x2f, 0, /* L1 L0 */ 0x39, 0x25, 0x6, /* L1 L1 */ 0x53, 0x73, 0, /* L1 BI */ 0x4b, 0x6b, 0, /* BI L0 */ 0x5b, 0x7b, 0, /* BI L1 */ 0x47, 0x67, 0x21 /* BI BI */ }; const int idx = (i_mb_type - B_L0_L0) * 3 + (h->mb.i_partition - D_16x8); int bits = i_mb_bits[idx]; x264_cabac_encode_decision_noup( cb, 27+3, bits&1 ); x264_cabac_encode_decision( cb, 27+5-(bits&1), (bits>>1)&1 ); bits >>= 2; if( bits != 1 ) { x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; x264_cabac_encode_decision( cb, 27+5, bits&1 ); bits >>= 1; if( bits != 1 ) x264_cabac_encode_decision_noup( cb, 27+5, bits&1 ); } const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type]; if( h->mb.pic.i_fref[0] > 1 ) { if( b_list[0][0] ) cabac_ref_b( h, cb, 0, 0 ); if( b_list[0][1] && h->mb.i_partition != D_16x16 ) cabac_ref_b( h, cb, 0, 8 >> (h->mb.i_partition == D_8x16) ); } if( h->mb.pic.i_fref[1] > 1 ) { if( b_list[1][0] ) cabac_ref_b( h, cb, 1, 0 ); if( b_list[1][1] && h->mb.i_partition != D_16x16 ) cabac_ref_b( h, cb, 1, 8 >> (h->mb.i_partition == D_8x16) ); } for( int i_list = 0; i_list < 2; i_list++ ) { if( h->mb.i_partition == D_16x16 ) { if( b_list[i_list][0] ) cabac_mvd( h, cb, i_list, 0, 4, 4 ); } else if( h->mb.i_partition == D_16x8 ) { if( b_list[i_list][0] ) cabac_mvd( h, cb, i_list, 0, 4, 2 ); if( b_list[i_list][1] ) cabac_mvd( h, cb, i_list, 8, 4, 2 ); } else //if( h->mb.i_partition == D_8x16 ) { if( b_list[i_list][0] ) cabac_mvd( h, cb, i_list, 0, 2, 4 ); if( b_list[i_list][1] ) cabac_mvd( h, cb, i_list, 4, 2, 4 ); } } } else /* intra */ cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_B, chroma ); } static ALWAYS_INLINE int cabac_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra, int b_dc ) { static const uint16_t base_ctx[14] = {85,89,93,97,101,1012,460,464,468,1016,472,476,480,1020}; if( b_dc ) { i_idx -= LUMA_DC; if( i_cat == DCT_CHROMA_DC ) { int i_nza = h->mb.cache.i_cbp_left != -1 ? (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1 : b_intra; int i_nzb = h->mb.cache.i_cbp_top != -1 ? (h->mb.cache.i_cbp_top >> (8 + i_idx)) & 1 : b_intra; return base_ctx[i_cat] + 2*i_nzb + i_nza; } else { int i_nza = (h->mb.cache.i_cbp_left >> (8 + i_idx)) & 1; int i_nzb = (h->mb.cache.i_cbp_top >> (8 + i_idx)) & 1; return base_ctx[i_cat] + 2*i_nzb + i_nza; } } else { int i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1]; int i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8]; if( x264_constant_p(b_intra) && !b_intra ) return base_ctx[i_cat] + ((2*i_nzb + i_nza)&0x7f); else { i_nza &= 0x7f + (b_intra << 7); i_nzb &= 0x7f + (b_intra << 7); return base_ctx[i_cat] + 2*!!i_nzb + !!i_nza; } } } // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). 
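/* Worked example (editorial, not in upstream x264): tracing the node-ctx state
 * machine with the tables below over coefficient magnitudes 1, 1, 3, 1, in the
 * order the level loop visits them, starting from node_ctx = 0:
 *   abs=1: "level>1" flag coded 0 with offset coeff_abs_level1_ctx[0] = 1; node -> 1
 *   abs=1: flag coded 0 with offset coeff_abs_level1_ctx[1] = 2;           node -> 2
 *   abs=3: flag coded 1 with offset coeff_abs_level1_ctx[2] = 3, remainder
 *          coded with offset coeff_abs_levelgt1_ctx[2] = 5;                node -> 4
 *   abs=1: flag coded 0 with offset coeff_abs_level1_ctx[4] = 0;           node -> 4
 * The transitions come from coeff_abs_level_transition[0][node] after a level
 * equal to 1 and coeff_abs_level_transition[1][node] after a level greater than 1. */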
/* map node ctx => cabac ctx for level=1 */ static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; /* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. */ static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, /* update node ctx after coding a level>1 */ { 4, 4, 4, 4, 5, 6, 7, 7 } }; #if !RDO_SKIP_BS static ALWAYS_INLINE void cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int coeff_idx = -1, node_ctx = 0; int last = h->quantf.coeff_last[ctx_block_cat]( l ); const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; dctcoef coeffs[64]; #define WRITE_SIGMAP( sig_off, last_off )\ {\ int i = 0;\ while( 1 )\ {\ if( l[i] )\ {\ coeffs[++coeff_idx] = l[i];\ x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\ if( i == last )\ {\ x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\ break;\ }\ else\ x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\ }\ else\ x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\ if( ++i == count_m1 )\ {\ coeffs[++coeff_idx] = l[i];\ break;\ }\ }\ } if( chroma422dc ) { int count_m1 = 7; WRITE_SIGMAP( x264_coeff_flag_offset_chroma_422_dc[i], x264_coeff_flag_offset_chroma_422_dc[i] ) } else { int count_m1 = x264_count_cat_m1[ctx_block_cat]; if( count_m1 == 63 ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; WRITE_SIGMAP( sig_offset[i], x264_last_coeff_flag_offset_8x8[i] ) } else WRITE_SIGMAP( i, i ) } do { /* write coeff_abs - 1 */ int coeff = coeffs[coeff_idx]; int abs_coeff = abs(coeff); int coeff_sign = coeff >> 31; int ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; if( abs_coeff > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); ctx = levelgt1_ctx[node_ctx] + ctx_level; for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- ) x264_cabac_encode_decision( cb, ctx, 1 ); if( abs_coeff < 15 ) x264_cabac_encode_decision( cb, ctx, 0 ); else x264_cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 ); node_ctx = coeff_abs_level_transition[1][node_ctx]; } else { x264_cabac_encode_decision( cb, ctx, 0 ); node_ctx = coeff_abs_level_transition[0][node_ctx]; } x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 ); } static ALWAYS_INLINE void cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { #if ARCH_X86_64 && HAVE_MMX h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); #endif } static void cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { /* Template a version specifically for chroma 4:2:2 DC in order to avoid * slowing down everything else due to 
the added complexity. */ cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 1 ); } #define cabac_block_residual_8x8( h, cb, cat, l ) cabac_block_residual( h, cb, cat, l ) #else /* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there * is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */ static ALWAYS_INLINE void cabac_block_residual_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = x264_significant_coeff_flag_offset_8x8[MB_INTERLACED]; int ctx_sig = x264_significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_last = x264_last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_level = x264_coeff_abs_level_m1_offset[ctx_block_cat]; int last = h->quantf.coeff_last[ctx_block_cat]( l ); int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; if( last != (b_8x8 ? 63 : chroma422dc ? 7 : x264_count_cat_m1[ctx_block_cat]) ) { x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[last] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[last] : last), 1 ); } if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); ctx = levelgt1_ctx[0] + ctx_level; if( coeff_abs < 15 ) { cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]]; cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; } else { cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]]; cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]]; x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 ); } node_ctx = coeff_abs_level_transition[1][0]; } else { x264_cabac_encode_decision( cb, ctx, 0 ); node_ctx = coeff_abs_level_transition[0][0]; x264_cabac_encode_bypass( cb, 0 ); // sign } for( int i = last-1; i >= 0; i-- ) { if( l[i] ) { coeff_abs = abs(l[i]); x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 1 ); x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? x264_last_coeff_flag_offset_8x8[i] : chroma422dc ? x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 ); ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); ctx = levelgt1_ctx[node_ctx] + ctx_level; if( coeff_abs < 15 ) { cb->f8_bits_encoded += x264_cabac_size_unary[coeff_abs-1][cb->state[ctx]]; cb->state[ctx] = x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx]]; } else { cb->f8_bits_encoded += x264_cabac_size_unary[14][cb->state[ctx]]; cb->state[ctx] = x264_cabac_transition_unary[14][cb->state[ctx]]; x264_cabac_encode_ue_bypass( cb, 0, coeff_abs - 15 ); } node_ctx = coeff_abs_level_transition[1][node_ctx]; } else { x264_cabac_encode_decision( cb, ctx, 0 ); node_ctx = coeff_abs_level_transition[0][node_ctx]; x264_cabac_encode_bypass( cb, 0 ); } } else x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : chroma422dc ? 
x264_coeff_flag_offset_chroma_422_dc[i] : i), 0 ); } } void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { cabac_block_residual_internal( h, cb, ctx_block_cat, l, 1, 0 ); } void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0, 0 ); } static ALWAYS_INLINE void cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { #if ARCH_X86_64 && HAVE_MMX h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); #endif } static ALWAYS_INLINE void cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { #if ARCH_X86_64 && HAVE_MMX h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); #endif } static void cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { cabac_block_residual_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } #endif #define cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, b_dc, name )\ do\ {\ int ctxidxinc = cabac_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra, b_dc );\ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ {\ x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ cabac_block_residual##name( h, cb, ctx_block_cat, l );\ }\ else\ x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ } while( 0 ) #define cabac_block_residual_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 1, ) #define cabac_block_residual_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, ) #define cabac_block_residual_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ cabac_block_residual_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, 0, _8x8 ) #define cabac_block_residual_422_dc_cbf( h, cb, ch, b_intra )\ cabac_block_residual_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, 1, _422_dc ) static ALWAYS_INLINE void macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma ) { const int i_mb_type = h->mb.i_type; #if !RDO_SKIP_BS const int i_mb_pos_start = x264_cabac_pos( cb ); int i_mb_pos_tex; if( SLICE_MBAFF && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) ) { cabac_field_decoding_flag( h, cb ); } #endif if( h->sh.i_type == SLICE_TYPE_P ) cabac_mb_header_p( h, cb, i_mb_type, chroma ); else if( h->sh.i_type == SLICE_TYPE_B ) cabac_mb_header_b( h, cb, i_mb_type, chroma ); else //if( h->sh.i_type == SLICE_TYPE_I ) cabac_mb_header_i( h, cb, i_mb_type, SLICE_TYPE_I, chroma ); #if !RDO_SKIP_BS i_mb_pos_tex = x264_cabac_pos( cb ); h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; if( i_mb_type == I_PCM ) { bs_t s; bs_init( &s, cb->p, cb->p_end - cb->p ); for( int p = 0; p < plane_count; p++ ) for( int i = 0; i < 256; i++ ) bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); if( chroma ) for( int ch = 1; ch < 3; ch++ ) for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ ) for( int j = 0; j < 8; j++ ) bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); bs_flush( &s ); cb->p = s.p; x264_cabac_encode_init_core( cb ); h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex; return; } #endif if( 
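/* Note (editorial, not in upstream x264): an explicit coded_block_pattern is only
 * sent for non-I_16x16 macroblocks; for I_16x16 the luma/chroma CBP was already
 * conveyed by the mb_type bins written in cabac_mb_type_intra(). */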
i_mb_type != I_16x16 ) { cabac_cbp_luma( h, cb ); if( chroma ) cabac_cbp_chroma( h, cb ); } if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma ) { cabac_transform_size( h, cb ); } if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 ) { const int b_intra = IS_INTRA( i_mb_type ); cabac_qp_delta( h, cb ); /* write residual */ if( i_mb_type == I_16x16 ) { /* DC Luma */ for( int p = 0; p < plane_count; p++ ) { cabac_block_residual_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 ); /* AC Luma */ if( h->mb.i_cbp_luma ) for( int i = p*16; i < p*16+16; i++ ) cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_AC][p], i, h->dct.luma4x4[i]+1, 1 ); } } else if( h->mb.b_transform_8x8 ) { if( plane_count == 3 ) { ALIGNED_4( uint8_t nnzbak[3][8] ); /* Stupid nnz munging in the case that neighbors don't have * 8x8 transform enabled. */ #define BACKUP( dst, src, res )\ dst = src;\ src = res; #define RESTORE( dst, src, res )\ src = dst; #define MUNGE_8x8_NNZ( MUNGE )\ if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] && !(h->mb.cbp[h->mb.i_mb_left_xy[0]] & 0x1000) )\ {\ MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x00 )\ MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x00 )\ MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x00 )\ MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x00 )\ MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x00 )\ MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x00 )\ }\ if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] && !(h->mb.cbp[h->mb.i_mb_left_xy[1]] & 0x1000) )\ {\ MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x00 )\ MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x00 )\ MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x00 )\ MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x00 )\ MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x00 )\ MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x00 )\ }\ if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] && !(h->mb.cbp[h->mb.i_mb_top_xy] & 0x1000) )\ {\ MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x00000000U )\ MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x00000000U )\ MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x00000000U )\ } MUNGE_8x8_NNZ( BACKUP ) for( int p = 0; p < 3; p++ ) FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i*4+p*16, h->dct.luma8x8[i+p*4], b_intra ); MUNGE_8x8_NNZ( RESTORE ) } else { FOREACH_BIT( i, 0, h->mb.i_cbp_luma ) cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i] ); } } else { for( int p = 0; p < plane_count; p++ ) FOREACH_BIT( i8x8, 0, h->mb.i_cbp_luma ) for( int i = 0; i < 4; i++ ) cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i+i8x8*4+p*16, h->dct.luma4x4[i+i8x8*4+p*16], b_intra ); } if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */ { if( CHROMA_FORMAT == CHROMA_422 ) { cabac_block_residual_422_dc_cbf( h, cb, 0, b_intra ); cabac_block_residual_422_dc_cbf( h, cb, 1, b_intra ); } else 
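/* 4:2:0 chroma DC is a plain 2x2 block; 4:2:2 has 2x4 chroma DC blocks (8 coefficients)
 * and therefore goes through the dedicated _422_dc path with its own scan and
 * context-offset tables. */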
{ cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra ); cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra ); } if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ { int step = 8 << CHROMA_V_SHIFT; for( int i = 16; i < 3*16; i += step ) for( int j = i; j < i+4; j++ ) cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra ); } } } #if !RDO_SKIP_BS h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex; #endif } void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) { if( CHROMA444 ) macroblock_write_cabac_internal( h, cb, 3, 0 ); else if( CHROMA_FORMAT ) macroblock_write_cabac_internal( h, cb, 1, 1 ); else macroblock_write_cabac_internal( h, cb, 1, 0 ); } #if RDO_SKIP_BS /***************************************************************************** * RD only; doesn't generate a valid bitstream * doesn't write cbp or chroma dc (I don't know how much this matters) * doesn't write ref (never varies between calls, so no point in doing so) * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO * works on all partition sizes except 16x16 *****************************************************************************/ static void partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel ) { const int i_mb_type = h->mb.i_type; int b_8x16 = h->mb.i_partition == D_8x16; int plane_count = CHROMA444 ? 3 : 1; if( i_mb_type == P_8x8 ) { cabac_8x8_mvd( h, cb, i8 ); cabac_subpartition_p( cb, h->mb.i_sub_partition[i8] ); } else if( i_mb_type == P_L0 ) cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2< B_DIRECT && i_mb_type < B_8x8 ) { if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cabac_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<>b_8x16, 2<mb.i_sub_partition[i8] ] ) cabac_mvd( h, cb, 0, 4*i8, 2, 2 ); if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] ) cabac_mvd( h, cb, 1, 4*i8, 2, 2 ); } for( int j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { if( h->mb.i_cbp_luma & (1 << i8) ) { if( h->mb.b_transform_8x8 ) { if( CHROMA444 ) for( int p = 0; p < 3; p++ ) cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 0 ); else cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] ); } else for( int p = 0; p < plane_count; p++ ) for( int i4 = 0; i4 < 4; i4++ ) cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16], 0 ); } if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) { int offset = (5*i8) & 0x09; cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 ); cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 ); cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 ); cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 ); } else { cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 ); } } i8 += x264_pixel_size[i_pixel].h >> 3; } } static void subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel ) { int b_8x4 = i_pixel == PIXEL_8x4; int plane_count = CHROMA444 ? 
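/* Note: when this file is built with RDO_SKIP_BS (it is #included from rdo.c for
 * rate-distortion costing), the cabac_encode_* calls resolve to size-estimation
 * versions, so these *_size_* helpers only accumulate an approximate bit count
 * instead of emitting a real bitstream. */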
3 : 1; if( i_pixel == PIXEL_4x4 ) cabac_mvd( h, cb, 0, i4, 1, 1 ); else cabac_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 ); for( int p = 0; p < plane_count; p++ ) { cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4, h->dct.luma4x4[p*16+i4], 0 ); if( i_pixel != PIXEL_4x4 ) cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4], 0 ); } } static void partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 ); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); cabac_intra4x4_pred_mode( cb, i_pred, i_mode ); cabac_cbp_luma( h, cb ); if( h->mb.i_cbp_luma & (1 << i8) ) { if( CHROMA444 ) for( int p = 0; p < 3; p++ ) cabac_block_residual_8x8_cbf( h, cb, ctx_cat_plane[DCT_LUMA_8x8][p], i8*4+p*16, h->dct.luma8x8[i8+p*4], 1 ); else cabac_block_residual_8x8( h, cb, DCT_LUMA_8x8, h->dct.luma8x8[i8] ); } } static void partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 ); int plane_count = CHROMA444 ? 3 : 1; i_mode = x264_mb_pred_mode4x4_fix( i_mode ); cabac_intra4x4_pred_mode( cb, i_pred, i_mode ); for( int p = 0; p < plane_count; p++ ) cabac_block_residual_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 ); } static void chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) { cabac_intra_chroma_pred_mode( h, cb ); cabac_cbp_chroma( h, cb ); if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) { cabac_block_residual_422_dc_cbf( h, cb, 0, 1 ); cabac_block_residual_422_dc_cbf( h, cb, 1, 1 ); } else { cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 ); cabac_block_residual_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 ); } if( h->mb.i_cbp_chroma == 2 ) { int step = 8 << CHROMA_V_SHIFT; for( int i = 16; i < 3*16; i += step ) for( int j = i; j < i+4; j++ ) cabac_block_residual_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 ); } } } #endif x264-master/encoder/cavlc.c000066400000000000000000000645771502133446700157550ustar00rootroot00000000000000/***************************************************************************** * cavlc.c: cavlc bitstream writing ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macroblock.h" #ifndef RDO_SKIP_BS #define RDO_SKIP_BS 0 #endif /* [400,420][inter,intra] */ static const uint8_t cbp_to_golomb[2][2][48] = { {{ 0, 1, 2, 5, 3, 6, 14, 10, 4, 15, 7, 11, 8, 12, 13, 9 }, { 1, 10, 11, 6, 12, 7, 14, 2, 13, 15, 8, 3, 9, 4, 5, 0 }}, {{ 0, 2, 3, 7, 4, 8, 17, 13, 5, 18, 9, 14, 10, 15, 16, 11, 1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19, 6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 }, { 3, 29, 30, 17, 31, 18, 37, 8, 32, 38, 19, 9, 20, 10, 11, 2, 16, 33, 34, 21, 35, 22, 39, 4, 36, 40, 23, 5, 24, 6, 7, 1, 41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0 }} }; static const uint8_t mb_type_b_to_golomb[3][9]= { { 4, 8, 12, 10, 6, 14, 16, 18, 20 }, /* D_16x8 */ { 5, 9, 13, 11, 7, 15, 17, 19, 21 }, /* D_8x16 */ { 1, -1, -1, -1, 2, -1, -1, -1, 3 } /* D_16x16 */ }; static const uint8_t subpartition_p_to_golomb[4]= { 3, 1, 2, 0 }; static const uint8_t subpartition_b_to_golomb[13]= { 10, 4, 5, 1, 11, 6, 7, 2, 12, 8, 9, 3, 0 }; #define bs_write_vlc(s,v) bs_write( s, (v).i_size, (v).i_bits ) /**************************************************************************** * x264_cavlc_block_residual: ****************************************************************************/ static inline int cavlc_block_residual_escape( x264_t *h, int i_suffix_length, int level ) { bs_t *s = &h->out.bs; static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff }; int i_level_prefix = 15; int mask = level >> 31; int abs_level = (level^mask)-mask; int i_level_code = abs_level*2-mask-2; if( ( i_level_code >> i_suffix_length ) < 15 ) { bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length, (1<= 1<<12 ) { if( h->sps->i_profile_idc >= PROFILE_HIGH ) { while( i_level_code >= 1<<(i_level_prefix-3) ) { i_level_code -= 1<<(i_level_prefix-3); i_level_prefix++; } } else { #if RDO_SKIP_BS /* Weight highly against overflows. */ s->i_bits_encoded += 2000; #else /* We've had an overflow; note it down and re-encode the MB later. 
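Profiles below High cap level_prefix at 15, so a coefficient this large simply cannot
be escaped; the slice writer then re-encodes the whole macroblock at a higher QP until it fits.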
*/ h->mb.b_overflow = 1; #endif } } bs_write( s, i_level_prefix + 1, 1 ); bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) ); } if( i_suffix_length == 0 ) i_suffix_length++; if( abs_level > next_suffix[i_suffix_length] ) i_suffix_length++; return i_suffix_length; } static int cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dctcoef *l, int nC ) { bs_t *s = &h->out.bs; static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0}; static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64}; x264_run_level_t runlevel; int i_total, i_trailing, i_total_zero, i_suffix_length; unsigned int i_sign; /* level and run and total */ i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; /* branchless i_trailing calculation */ runlevel.level[i_total+0] = 2; runlevel.level[i_total+1] = 2; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2) | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4); i_trailing = ctz_index[i_trailing]; i_sign = ((runlevel.level[2] >> 31) & 1) | ((runlevel.level[1] >> 31) & 2) | ((runlevel.level[0] >> 31) & 4); i_sign >>= 3-i_trailing; /* total/trailing */ bs_write_vlc( s, x264_coeff_token[nC][i_total-1][i_trailing] ); i_suffix_length = i_total > 10 && i_trailing < 3; bs_write( s, i_trailing, i_sign ); if( i_trailing < i_total ) { int val = runlevel.level[i_trailing]; int val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2; val -= ((val>>31)|1) & -(i_trailing < 3); /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */ val += LEVEL_TABLE_SIZE/2; if( (unsigned)val_original < LEVEL_TABLE_SIZE ) { bs_write_vlc( s, x264_level_token[i_suffix_length][val] ); i_suffix_length = x264_level_token[i_suffix_length][val_original].i_next; } else i_suffix_length = cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 ); for( int i = i_trailing+1; i < i_total; i++ ) { val = runlevel.level[i] + LEVEL_TABLE_SIZE/2; if( (unsigned)val < LEVEL_TABLE_SIZE ) { bs_write_vlc( s, x264_level_token[i_suffix_length][val] ); i_suffix_length = x264_level_token[i_suffix_length][val].i_next; } else i_suffix_length = cavlc_block_residual_escape( h, i_suffix_length, val-LEVEL_TABLE_SIZE/2 ); } } if( ctx_block_cat == DCT_CHROMA_DC ) { if( i_total < 8>>CHROMA_V_SHIFT ) { vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero] : x264_total_zeros_2x4_dc[i_total-1][i_total_zero]; bs_write_vlc( s, total_zeros ); } } else if( (uint8_t)i_total < count_cat[ctx_block_cat] ) bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); int zero_run_code = x264_run_before[runlevel.mask]; bs_write( s, zero_run_code&0x1f, zero_run_code>>5 ); return i_total; } static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3}; #define x264_cavlc_block_residual(h,cat,idx,l)\ {\ int nC = cat == DCT_CHROMA_DC ? 5 - CHROMA_V_SHIFT\ : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? 
(idx - LUMA_DC)*16 : idx )];\ uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\ if( !*nnz )\ bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\ else\ *nnz = cavlc_block_residual_internal(h,cat,l,nC);\ } static void cavlc_qp_delta( x264_t *h ) { bs_t *s = &h->out.bs; int i_dqp = h->mb.i_qp - h->mb.i_last_qp; /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely * flat background area. Don't do this if it would raise the quantizer, since that could * cause unexpected deblocking artifacts. */ if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && !h->mb.cache.non_zero_count[x264_scan8[LUMA_DC]] && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] && !h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] && h->mb.i_qp > h->mb.i_last_qp ) { #if !RDO_SKIP_BS h->mb.i_qp = h->mb.i_last_qp; #endif i_dqp = 0; } if( i_dqp ) { if( i_dqp < -(QP_MAX_SPEC+1)/2 ) i_dqp += QP_MAX_SPEC+1; else if( i_dqp > QP_MAX_SPEC/2 ) i_dqp -= QP_MAX_SPEC+1; } bs_write_se( s, i_dqp ); } static void cavlc_mvd( x264_t *h, int i_list, int idx, int width ) { bs_t *s = &h->out.bs; ALIGNED_4( int16_t mvp[2] ); x264_mb_predict_mv( h, i_list, idx, width, mvp ); bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] ); bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] ); } static inline void cavlc_8x8_mvd( x264_t *h, int i ) { switch( h->mb.i_sub_partition[i] ) { case D_L0_8x8: cavlc_mvd( h, 0, 4*i, 2 ); break; case D_L0_8x4: cavlc_mvd( h, 0, 4*i+0, 2 ); cavlc_mvd( h, 0, 4*i+2, 2 ); break; case D_L0_4x8: cavlc_mvd( h, 0, 4*i+0, 1 ); cavlc_mvd( h, 0, 4*i+1, 1 ); break; case D_L0_4x4: cavlc_mvd( h, 0, 4*i+0, 1 ); cavlc_mvd( h, 0, 4*i+1, 1 ); cavlc_mvd( h, 0, 4*i+2, 1 ); cavlc_mvd( h, 0, 4*i+3, 1 ); break; } } static ALWAYS_INLINE void cavlc_macroblock_luma_residual( x264_t *h, int plane_count ) { if( h->mb.b_transform_8x8 ) { /* shuffle 8x8 dct coeffs into 4x4 lists */ for( int p = 0; p < plane_count; p++ ) for( int i8 = 0; i8 < 4; i8++ ) if( h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ) h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[p*16+i8*4], h->dct.luma8x8[p*4+i8], &h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4]] ); } for( int p = 0; p < plane_count; p++ ) FOREACH_BIT( i8, 0, h->mb.i_cbp_luma ) for( int i4 = 0; i4 < 4; i4++ ) x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); } #if RDO_SKIP_BS static ALWAYS_INLINE void cavlc_partition_luma_residual( x264_t *h, int i8, int p ) { if( h->mb.b_transform_8x8 && h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] ) h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4+p*16], h->dct.luma8x8[i8+p*4], &h->mb.cache.non_zero_count[x264_scan8[i8*4+p*16]] ); if( h->mb.i_cbp_luma & (1 << i8) ) for( int i4 = 0; i4 < 4; i4++ ) x264_cavlc_block_residual( h, DCT_LUMA_4x4, i4+i8*4+p*16, h->dct.luma4x4[i4+i8*4+p*16] ); } #endif static void cavlc_mb_header_i( x264_t *h, int i_mb_type, int i_mb_i_offset, int chroma ) { bs_t *s = &h->out.bs; if( i_mb_type == I_16x16 ) { bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] + h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) ); } else //if( i_mb_type == I_4x4 || i_mb_type == I_8x8 ) { int di = i_mb_type == I_8x8 ? 
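/* The single I_16x16 codeword above packs the prediction mode, the chroma CBP (x4) and
 * a nonzero-luma-CBP flag (+12) into one ue(v); for I_4x4/I_8x8 each block's mode below
 * costs 1 bit when it matches the predicted mode and 4 bits otherwise. */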
4 : 1; bs_write_ue( s, i_mb_i_offset + 0 ); if( h->pps->b_transform_8x8_mode ) bs_write1( s, h->mb.b_transform_8x8 ); /* Prediction: Luma */ for( int i = 0; i < 16; i += di ) { int i_pred = x264_mb_predict_intra4x4_mode( h, i ); int i_mode = x264_mb_pred_mode4x4_fix( h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] ); if( i_pred == i_mode ) bs_write1( s, 1 ); /* b_prev_intra4x4_pred_mode */ else bs_write( s, 4, i_mode - (i_mode > i_pred) ); } } if( chroma ) bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); } static ALWAYS_INLINE void cavlc_mb_header_p( x264_t *h, int i_mb_type, int chroma ) { bs_t *s = &h->out.bs; if( i_mb_type == P_L0 ) { if( h->mb.i_partition == D_16x16 ) { bs_write1( s, 1 ); if( h->mb.pic.i_fref[0] > 1 ) bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] ); cavlc_mvd( h, 0, 0, 4 ); } else if( h->mb.i_partition == D_16x8 ) { bs_write_ue( s, 1 ); if( h->mb.pic.i_fref[0] > 1 ) { bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] ); bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] ); } cavlc_mvd( h, 0, 0, 4 ); cavlc_mvd( h, 0, 8, 4 ); } else if( h->mb.i_partition == D_8x16 ) { bs_write_ue( s, 2 ); if( h->mb.pic.i_fref[0] > 1 ) { bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] ); bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] ); } cavlc_mvd( h, 0, 0, 2 ); cavlc_mvd( h, 0, 4, 2 ); } } else if( i_mb_type == P_8x8 ) { int b_sub_ref; if( (h->mb.cache.ref[0][x264_scan8[0]] | h->mb.cache.ref[0][x264_scan8[ 4]] | h->mb.cache.ref[0][x264_scan8[8]] | h->mb.cache.ref[0][x264_scan8[12]]) == 0 ) { bs_write_ue( s, 4 ); b_sub_ref = 0; } else { bs_write_ue( s, 3 ); b_sub_ref = 1; } /* sub mb type */ if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 ) for( int i = 0; i < 4; i++ ) bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i] ] ); else bs_write( s, 4, 0xf ); /* ref0 */ if( b_sub_ref ) { bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[0]] ); bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[4]] ); bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[8]] ); bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[12]] ); } for( int i = 0; i < 4; i++ ) cavlc_8x8_mvd( h, i ); } else //if( IS_INTRA( i_mb_type ) ) cavlc_mb_header_i( h, i_mb_type, 5, chroma ); } static ALWAYS_INLINE void cavlc_mb_header_b( x264_t *h, int i_mb_type, int chroma ) { bs_t *s = &h->out.bs; if( i_mb_type == B_8x8 ) { bs_write_ue( s, 22 ); /* sub mb type */ for( int i = 0; i < 4; i++ ) bs_write_ue( s, subpartition_b_to_golomb[ h->mb.i_sub_partition[i] ] ); /* ref */ if( h->mb.pic.i_fref[0] > 1 ) for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) bs_write_te( s, h->mb.pic.i_fref[0] - 1, h->mb.cache.ref[0][x264_scan8[i*4]] ); if( h->mb.pic.i_fref[1] > 1 ) for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) bs_write_te( s, h->mb.pic.i_fref[1] - 1, h->mb.cache.ref[1][x264_scan8[i*4]] ); /* mvd */ for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] ) cavlc_mvd( h, 0, 4*i, 2 ); for( int i = 0; i < 4; i++ ) if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] ) cavlc_mvd( h, 1, 4*i, 2 ); } else if( i_mb_type >= B_L0_L0 && i_mb_type <= B_BI_BI ) { /* All B mode */ /* Motion Vector */ const uint8_t (*b_list)[2] = x264_mb_type_list_table[i_mb_type]; 
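/* x264_mb_type_list_table[mb_type][list][partition] flags which reference lists each
 * partition of this B macroblock actually uses, so only the refs and mvds that are
 * present in the syntax get written below. */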
const int i_ref0_max = h->mb.pic.i_fref[0] - 1; const int i_ref1_max = h->mb.pic.i_fref[1] - 1; bs_write_ue( s, mb_type_b_to_golomb[ h->mb.i_partition - D_16x8 ][ i_mb_type - B_L0_L0 ] ); if( h->mb.i_partition == D_16x16 ) { if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[0]] ); if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[0]] ); if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 4 ); if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 4 ); } else { if( i_ref0_max && b_list[0][0] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[ 0]] ); if( i_ref0_max && b_list[0][1] ) bs_write_te( s, i_ref0_max, h->mb.cache.ref[0][x264_scan8[12]] ); if( i_ref1_max && b_list[1][0] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[ 0]] ); if( i_ref1_max && b_list[1][1] ) bs_write_te( s, i_ref1_max, h->mb.cache.ref[1][x264_scan8[12]] ); if( h->mb.i_partition == D_16x8 ) { if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 4 ); if( b_list[0][1] ) cavlc_mvd( h, 0, 8, 4 ); if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 4 ); if( b_list[1][1] ) cavlc_mvd( h, 1, 8, 4 ); } else //if( h->mb.i_partition == D_8x16 ) { if( b_list[0][0] ) cavlc_mvd( h, 0, 0, 2 ); if( b_list[0][1] ) cavlc_mvd( h, 0, 4, 2 ); if( b_list[1][0] ) cavlc_mvd( h, 1, 0, 2 ); if( b_list[1][1] ) cavlc_mvd( h, 1, 4, 2 ); } } } else if( i_mb_type == B_DIRECT ) bs_write1( s, 1 ); else //if( IS_INTRA( i_mb_type ) ) cavlc_mb_header_i( h, i_mb_type, 23, chroma ); } /***************************************************************************** * x264_macroblock_write: *****************************************************************************/ void x264_macroblock_write_cavlc( x264_t *h ) { bs_t *s = &h->out.bs; const int i_mb_type = h->mb.i_type; int plane_count = CHROMA444 ? 
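/* CAVLC macroblock layout: optional MBAFF field flag, mb_type (I_PCM bypasses everything
 * and writes raw samples), coded_block_pattern unless I_16x16 (whose CBP is carried in
 * the mb_type), the 8x8-transform flag, then mb_qp_delta and the residuals. */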
3 : 1; int chroma = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422; #if RDO_SKIP_BS s->i_bits_encoded = 0; #else const int i_mb_pos_start = bs_pos( s ); int i_mb_pos_tex; #endif if( SLICE_MBAFF && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) ) { bs_write1( s, MB_INTERLACED ); #if !RDO_SKIP_BS h->mb.field_decoding_flag = MB_INTERLACED; #endif } #if !RDO_SKIP_BS if( i_mb_type == I_PCM ) { static const uint8_t i_offsets[3] = {5,23,0}; uint8_t *p_start = s->p_start; bs_write_ue( s, i_offsets[h->sh.i_type] + 25 ); i_mb_pos_tex = bs_pos( s ); h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; bs_align_0( s ); for( int p = 0; p < plane_count; p++ ) for( int i = 0; i < 256; i++ ) bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); if( chroma ) for( int ch = 1; ch < 3; ch++ ) for( int i = 0; i < 16>>CHROMA_V_SHIFT; i++ ) for( int j = 0; j < 8; j++ ) bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); bs_init( s, s->p, s->p_end - s->p ); s->p_start = p_start; h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex; return; } #endif if( h->sh.i_type == SLICE_TYPE_P ) cavlc_mb_header_p( h, i_mb_type, chroma ); else if( h->sh.i_type == SLICE_TYPE_B ) cavlc_mb_header_b( h, i_mb_type, chroma ); else //if( h->sh.i_type == SLICE_TYPE_I ) cavlc_mb_header_i( h, i_mb_type, 0, chroma ); #if !RDO_SKIP_BS i_mb_pos_tex = bs_pos( s ); h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; #endif /* Coded block pattern */ if( i_mb_type != I_16x16 ) bs_write_ue( s, cbp_to_golomb[chroma][IS_INTRA(i_mb_type)][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] ); /* transform size 8x8 flag */ if( x264_mb_transform_8x8_allowed( h ) && h->mb.i_cbp_luma ) bs_write1( s, h->mb.b_transform_8x8 ); if( i_mb_type == I_16x16 ) { cavlc_qp_delta( h ); /* DC Luma */ for( int p = 0; p < plane_count; p++ ) { x264_cavlc_block_residual( h, DCT_LUMA_DC, LUMA_DC+p, h->dct.luma16x16_dc[p] ); /* AC Luma */ if( h->mb.i_cbp_luma ) for( int i = p*16; i < p*16+16; i++ ) x264_cavlc_block_residual( h, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1 ); } } else if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma ) { cavlc_qp_delta( h ); cavlc_macroblock_luma_residual( h, plane_count ); } if( h->mb.i_cbp_chroma ) { /* Chroma DC residual present */ x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] ); x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] ); if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ { int step = 8 << CHROMA_V_SHIFT; for( int i = 16; i < 3*16; i += step ) for( int j = i; j < i+4; j++ ) x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 ); } } #if !RDO_SKIP_BS h->stat.frame.i_tex_bits += bs_pos(s) - i_mb_pos_tex; #endif } #if RDO_SKIP_BS /***************************************************************************** * RD only; doesn't generate a valid bitstream * doesn't write cbp or chroma dc (I don't know how much this matters) * doesn't write ref (never varies between calls, so no point in doing so) * only writes subpartition for p8x8, needed for sub-8x8 mode decision RDO * works on all partition sizes except 16x16 *****************************************************************************/ static int partition_size_cavlc( x264_t *h, int i8, int i_pixel ) { bs_t *s = &h->out.bs; const int i_mb_type = h->mb.i_type; int b_8x16 = h->mb.i_partition == D_8x16; int plane_count = CHROMA444 ? 
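/* As with the CABAC version, under RDO_SKIP_BS (rdo.c) the bs_write* calls are
 * redefined to only accumulate i_bits_encoded, so these size functions return a bit
 * cost without touching the output bitstream. */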
3 : 1; int j; h->out.bs.i_bits_encoded = 0; if( i_mb_type == P_8x8 ) { cavlc_8x8_mvd( h, i8 ); bs_write_ue( s, subpartition_p_to_golomb[ h->mb.i_sub_partition[i8] ] ); } else if( i_mb_type == P_L0 ) cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 ); else if( i_mb_type > B_DIRECT && i_mb_type < B_8x8 ) { if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) cavlc_mvd( h, 0, 4*i8, 4>>b_8x16 ); if( x264_mb_type_list_table[ i_mb_type ][1][!!i8] ) cavlc_mvd( h, 1, 4*i8, 4>>b_8x16 ); } else //if( i_mb_type == B_8x8 ) { if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] ) cavlc_mvd( h, 0, 4*i8, 2 ); if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] ) cavlc_mvd( h, 1, 4*i8, 2 ); } for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) { for( int p = 0; p < plane_count; p++ ) cavlc_partition_luma_residual( h, i8, p ); if( h->mb.i_cbp_chroma ) { if( CHROMA_FORMAT == CHROMA_422 ) { int offset = (5*i8) & 0x09; x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 ); x264_cavlc_block_residual( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 ); x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 ); x264_cavlc_block_residual( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 ); } else { x264_cavlc_block_residual( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 ); x264_cavlc_block_residual( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 ); } } i8 += x264_pixel_size[i_pixel].h >> 3; } return h->out.bs.i_bits_encoded; } static int subpartition_size_cavlc( x264_t *h, int i4, int i_pixel ) { int plane_count = CHROMA444 ? 3 : 1; int b_8x4 = i_pixel == PIXEL_8x4; h->out.bs.i_bits_encoded = 0; cavlc_mvd( h, 0, i4, 1+b_8x4 ); for( int p = 0; p < plane_count; p++ ) { x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] ); if( i_pixel != PIXEL_4x4 ) x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4+2-b_8x4, h->dct.luma4x4[p*16+i4+2-b_8x4] ); } return h->out.bs.i_bits_encoded; } static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode ) { if( x264_mb_predict_intra4x4_mode( h, i4 ) == x264_mb_pred_mode4x4_fix( i_mode ) ) return 1; else return 4; } static int partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode ) { int plane_count = CHROMA444 ? 3 : 1; h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); bs_write_ue( &h->out.bs, cbp_to_golomb[!CHROMA444][1][(h->mb.i_cbp_chroma << 4)|h->mb.i_cbp_luma] ); for( int p = 0; p < plane_count; p++ ) cavlc_partition_luma_residual( h, i8, p ); return h->out.bs.i_bits_encoded; } static int partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) { int plane_count = CHROMA444 ? 
3 : 1; h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode ); for( int p = 0; p < plane_count; p++ ) x264_cavlc_block_residual( h, DCT_LUMA_4x4, p*16+i4, h->dct.luma4x4[p*16+i4] ); return h->out.bs.i_bits_encoded; } static int chroma_size_cavlc( x264_t *h ) { h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); if( h->mb.i_cbp_chroma ) { x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] ); x264_cavlc_block_residual( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] ); if( h->mb.i_cbp_chroma == 2 ) { int step = 8 << CHROMA_V_SHIFT; for( int i = 16; i < 3*16; i += step ) for( int j = i; j < i+4; j++ ) x264_cavlc_block_residual( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 ); } } return h->out.bs.i_bits_encoded; } #endif x264-master/encoder/encoder.c000066400000000000000000005535351502133446700163010ustar00rootroot00000000000000/***************************************************************************** * encoder.c: top-level encoder functions ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "set.h" #include "analyse.h" #include "ratecontrol.h" #include "macroblock.h" #include "me.h" #if HAVE_INTEL_DISPATCHER #include "extras/intel_dispatcher.h" #endif //#define DEBUG_MB_TYPE #define bs_write_ue bs_write_ue_big // forward declaration needed for template usage void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ); void x264_macroblock_cache_load_progressive( x264_t *h, int i_mb_x, int i_mb_y ); static int encoder_frame_end( x264_t *h, x264_t *thread_current, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_out ); /**************************************************************************** * ******************************* x264 libs ********************************** * ****************************************************************************/ static double calc_psnr( double sqe, double size ) { double mse = sqe / (PIXEL_MAX*PIXEL_MAX * size); if( mse <= 0.0000000001 ) /* Max 100dB */ return 100; return -10.0 * log10( mse ); } static double calc_ssim_db( double ssim ) { double inv_ssim = 1 - ssim; if( inv_ssim <= 0.0000000001 ) /* Max 100dB */ return 100; return -10.0 * log10( inv_ssim ); } static int threadpool_wait_all( x264_t *h ) { for( int i = 0; i < h->param.i_threads; i++ ) if( h->thread[i]->b_thread_active ) { h->thread[i]->b_thread_active = 0; if( (intptr_t)x264_threadpool_wait( h->threadpool, h->thread[i] ) < 0 ) return -1; } return 0; } static void frame_dump( x264_t *h ) { FILE *f = x264_fopen( h->param.psz_dump_yuv, "r+b" ); if( !f ) return; /* Wait for the threads to finish deblocking */ if( h->param.b_sliced_threads ) threadpool_wait_all( h ); /* Write the frame in display order */ int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * SIZEOF_PIXEL ); if( !fseek( f, (int64_t)h->fdec->i_frame * frame_size, SEEK_SET ) ) { for( int p = 0; p < (CHROMA444 ? 
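/* The reconstructed 4:2:0/4:2:2 chroma is stored interleaved (NV12/NV16 layout), so it
 * is deinterleaved into temporary planar U/V buffers below before being appended to the
 * dump file in display order. */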
3 : 1); p++ ) for( int y = 0; y < h->param.i_height; y++ ) fwrite( &h->fdec->plane[p][y*h->fdec->i_stride[p]], SIZEOF_PIXEL, h->param.i_width, f ); if( CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422 ) { int cw = h->param.i_width>>1; int ch = h->param.i_height>>CHROMA_V_SHIFT; pixel *planeu = x264_malloc( 2 * (cw*ch*SIZEOF_PIXEL + 32) ); if( planeu ) { pixel *planev = planeu + cw*ch + 32/SIZEOF_PIXEL; h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); fwrite( planeu, 1, cw*ch*SIZEOF_PIXEL, f ); fwrite( planev, 1, cw*ch*SIZEOF_PIXEL, f ); x264_free( planeu ); } } } fclose( f ); } /* Fill "default" values */ static void slice_header_init( x264_t *h, x264_slice_header_t *sh, x264_sps_t *sps, x264_pps_t *pps, int i_idr_pic_id, int i_frame, int i_qp ) { x264_param_t *param = &h->param; /* First we fill all fields */ sh->sps = sps; sh->pps = pps; sh->i_first_mb = 0; sh->i_last_mb = h->mb.i_mb_count - 1; sh->i_pps_id = pps->i_id; sh->i_frame_num = i_frame; sh->b_mbaff = PARAM_INTERLACED; sh->b_field_pic = 0; /* no field support for now */ sh->b_bottom_field = 0; /* not yet used */ sh->i_idr_pic_id = i_idr_pic_id; /* poc stuff, fixed later */ sh->i_poc = 0; sh->i_delta_poc_bottom = 0; sh->i_delta_poc[0] = 0; sh->i_delta_poc[1] = 0; sh->i_redundant_pic_cnt = 0; h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO && h->param.i_bframe && ( h->param.rc.b_stat_write || !h->param.rc.b_stat_read ); if( !h->mb.b_direct_auto_read && sh->i_type == SLICE_TYPE_B ) { if( h->fref[1][0]->i_poc_l0ref0 == h->fref[0][0]->i_poc ) { if( h->mb.b_direct_auto_write ) sh->b_direct_spatial_mv_pred = ( h->stat.i_direct_score[1] > h->stat.i_direct_score[0] ); else sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL ); } else { h->mb.b_direct_auto_write = 0; sh->b_direct_spatial_mv_pred = 1; } } /* else b_direct_spatial_mv_pred was read from the 2pass statsfile */ sh->b_num_ref_idx_override = 0; sh->i_num_ref_idx_l0_active = 1; sh->i_num_ref_idx_l1_active = 1; sh->b_ref_pic_list_reordering[0] = h->b_ref_reorder[0]; sh->b_ref_pic_list_reordering[1] = h->b_ref_reorder[1]; /* If the ref list isn't in the default order, construct reordering header */ for( int list = 0; list < 2; list++ ) { if( sh->b_ref_pic_list_reordering[list] ) { int pred_frame_num = i_frame; for( int i = 0; i < h->i_ref[list]; i++ ) { int diff = h->fref[list][i]->i_frame_num - pred_frame_num; sh->ref_pic_list_order[list][i].idc = ( diff > 0 ); sh->ref_pic_list_order[list][i].arg = (abs(diff) - 1) & ((1 << sps->i_log2_max_frame_num) - 1); pred_frame_num = h->fref[list][i]->i_frame_num; } } } sh->i_cabac_init_idc = param->i_cabac_init_idc; sh->i_qp = SPEC_QP(i_qp); sh->i_qp_delta = sh->i_qp - pps->i_pic_init_qp; sh->b_sp_for_swidth = 0; sh->i_qs_delta = 0; int deblock_thresh = i_qp + 2 * X264_MIN(param->i_deblocking_filter_alphac0, param->i_deblocking_filter_beta); /* If effective qp <= 15, deblocking would have no effect anyway */ if( param->b_deblocking_filter && (h->mb.b_variable_qp || 15 < deblock_thresh ) ) sh->i_disable_deblocking_filter_idc = param->b_sliced_threads ? 
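/* disable_deblocking_filter_idc: 0 = filter everything, 1 = filter off, 2 = filter on
 * but not across slice boundaries; sliced threads need 2 so each slice can be
 * deblocked independently. */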
2 : 0; else sh->i_disable_deblocking_filter_idc = 1; sh->i_alpha_c0_offset = param->i_deblocking_filter_alphac0 * 2; sh->i_beta_offset = param->i_deblocking_filter_beta * 2; } static void slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal_ref_idc ) { if( sh->b_mbaff ) { int first_x = sh->i_first_mb % sh->sps->i_mb_width; int first_y = sh->i_first_mb / sh->sps->i_mb_width; assert( (first_y&1) == 0 ); bs_write_ue( s, (2*first_x + sh->sps->i_mb_width*(first_y&~1) + (first_y&1)) >> 1 ); } else bs_write_ue( s, sh->i_first_mb ); bs_write_ue( s, sh->i_type + 5 ); /* same type things */ bs_write_ue( s, sh->i_pps_id ); bs_write( s, sh->sps->i_log2_max_frame_num, sh->i_frame_num & ((1<sps->i_log2_max_frame_num)-1) ); if( !sh->sps->b_frame_mbs_only ) { bs_write1( s, sh->b_field_pic ); if( sh->b_field_pic ) bs_write1( s, sh->b_bottom_field ); } if( sh->i_idr_pic_id >= 0 ) /* NAL IDR */ bs_write_ue( s, sh->i_idr_pic_id ); if( sh->sps->i_poc_type == 0 ) { bs_write( s, sh->sps->i_log2_max_poc_lsb, sh->i_poc & ((1<sps->i_log2_max_poc_lsb)-1) ); if( sh->pps->b_pic_order && !sh->b_field_pic ) bs_write_se( s, sh->i_delta_poc_bottom ); } if( sh->pps->b_redundant_pic_cnt ) bs_write_ue( s, sh->i_redundant_pic_cnt ); if( sh->i_type == SLICE_TYPE_B ) bs_write1( s, sh->b_direct_spatial_mv_pred ); if( sh->i_type == SLICE_TYPE_P || sh->i_type == SLICE_TYPE_B ) { bs_write1( s, sh->b_num_ref_idx_override ); if( sh->b_num_ref_idx_override ) { bs_write_ue( s, sh->i_num_ref_idx_l0_active - 1 ); if( sh->i_type == SLICE_TYPE_B ) bs_write_ue( s, sh->i_num_ref_idx_l1_active - 1 ); } } /* ref pic list reordering */ if( sh->i_type != SLICE_TYPE_I ) { bs_write1( s, sh->b_ref_pic_list_reordering[0] ); if( sh->b_ref_pic_list_reordering[0] ) { for( int i = 0; i < sh->i_num_ref_idx_l0_active; i++ ) { bs_write_ue( s, sh->ref_pic_list_order[0][i].idc ); bs_write_ue( s, sh->ref_pic_list_order[0][i].arg ); } bs_write_ue( s, 3 ); } } if( sh->i_type == SLICE_TYPE_B ) { bs_write1( s, sh->b_ref_pic_list_reordering[1] ); if( sh->b_ref_pic_list_reordering[1] ) { for( int i = 0; i < sh->i_num_ref_idx_l1_active; i++ ) { bs_write_ue( s, sh->ref_pic_list_order[1][i].idc ); bs_write_ue( s, sh->ref_pic_list_order[1][i].arg ); } bs_write_ue( s, 3 ); } } sh->b_weighted_pred = 0; if( sh->pps->b_weighted_pred && sh->i_type == SLICE_TYPE_P ) { sh->b_weighted_pred = sh->weight[0][0].weightfn || sh->weight[0][1].weightfn || sh->weight[0][2].weightfn; /* pred_weight_table() */ bs_write_ue( s, sh->weight[0][0].i_denom ); /* luma_log2_weight_denom */ if( sh->sps->i_chroma_format_idc ) bs_write_ue( s, sh->weight[0][1].i_denom ); /* chroma_log2_weight_denom */ for( int i = 0; i < sh->i_num_ref_idx_l0_active; i++ ) { int luma_weight_l0_flag = !!sh->weight[i][0].weightfn; bs_write1( s, luma_weight_l0_flag ); if( luma_weight_l0_flag ) { bs_write_se( s, sh->weight[i][0].i_scale ); bs_write_se( s, sh->weight[i][0].i_offset ); } if( sh->sps->i_chroma_format_idc ) { int chroma_weight_l0_flag = sh->weight[i][1].weightfn || sh->weight[i][2].weightfn; bs_write1( s, chroma_weight_l0_flag ); if( chroma_weight_l0_flag ) { for( int j = 1; j < 3; j++ ) { bs_write_se( s, sh->weight[i][j].i_scale ); bs_write_se( s, sh->weight[i][j].i_offset ); } } } } } else if( sh->pps->b_weighted_bipred == 1 && sh->i_type == SLICE_TYPE_B ) { /* TODO */ } if( i_nal_ref_idc != 0 ) { if( sh->i_idr_pic_id >= 0 ) { bs_write1( s, 0 ); /* no output of prior pics flag */ bs_write1( s, 0 ); /* long term reference flag */ } else { bs_write1( s, sh->i_mmco_command_count > 0 ); /* 
adaptive_ref_pic_marking_mode_flag */ if( sh->i_mmco_command_count > 0 ) { for( int i = 0; i < sh->i_mmco_command_count; i++ ) { bs_write_ue( s, 1 ); /* mark short term ref as unused */ bs_write_ue( s, sh->mmco[i].i_difference_of_pic_nums - 1 ); } bs_write_ue( s, 0 ); /* end command list */ } } } if( sh->pps->b_cabac && sh->i_type != SLICE_TYPE_I ) bs_write_ue( s, sh->i_cabac_init_idc ); bs_write_se( s, sh->i_qp_delta ); /* slice qp delta */ if( sh->pps->b_deblocking_filter_control ) { bs_write_ue( s, sh->i_disable_deblocking_filter_idc ); if( sh->i_disable_deblocking_filter_idc != 1 ) { bs_write_se( s, sh->i_alpha_c0_offset >> 1 ); bs_write_se( s, sh->i_beta_offset >> 1 ); } } } /* If we are within a reasonable distance of the end of the memory allocated for the bitstream, */ /* reallocate, adding an arbitrary amount of space. */ static int bitstream_check_buffer_internal( x264_t *h, int size, int b_cabac, int i_nal ) { if( (b_cabac && (h->cabac.p_end - h->cabac.p < size)) || (h->out.bs.p_end - h->out.bs.p < size) ) { if( size > INT_MAX - h->out.i_bitstream ) return -1; int buf_size = h->out.i_bitstream + size; uint8_t *buf = x264_malloc( buf_size ); if( !buf ) return -1; int aligned_size = h->out.i_bitstream & ~15; h->mc.memcpy_aligned( buf, h->out.p_bitstream, aligned_size ); memcpy( buf + aligned_size, h->out.p_bitstream + aligned_size, h->out.i_bitstream - aligned_size ); intptr_t delta = buf - h->out.p_bitstream; h->out.bs.p_start += delta; h->out.bs.p += delta; h->out.bs.p_end = buf + buf_size; h->cabac.p_start += delta; h->cabac.p += delta; h->cabac.p_end = buf + buf_size; for( int i = 0; i <= i_nal; i++ ) h->out.nal[i].p_payload += delta; x264_free( h->out.p_bitstream ); h->out.p_bitstream = buf; h->out.i_bitstream = buf_size; } return 0; } static int bitstream_check_buffer( x264_t *h ) { int max_row_size = (2500 << SLICE_MBAFF) * h->mb.i_mb_width; return bitstream_check_buffer_internal( h, max_row_size, h->param.b_cabac, h->out.i_nal ); } static int bitstream_check_buffer_filler( x264_t *h, int filler ) { filler += 32; // add padding for safety return bitstream_check_buffer_internal( h, filler, 0, -1 ); } /**************************************************************************** * **************************************************************************** ****************************** External API********************************* **************************************************************************** * ****************************************************************************/ static int validate_parameters( x264_t *h, int b_open ) { if( !h->param.pf_log ) { x264_log_internal( X264_LOG_ERROR, "pf_log not set! 
did you forget to call x264_param_default?\n" ); return -1; } #if HAVE_MMX if( b_open ) { uint32_t cpuflags = x264_cpu_detect(); int fail = 0; #ifdef __SSE__ if( !(cpuflags & X264_CPU_SSE) ) { x264_log( h, X264_LOG_ERROR, "your cpu does not support SSE1, but x264 was compiled with asm\n"); fail = 1; } #else if( !(cpuflags & X264_CPU_MMX2) ) { x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm\n"); fail = 1; } #endif if( fail ) { x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); return -1; } } #endif #if HAVE_INTERLACED h->param.b_interlaced = !!PARAM_INTERLACED; #else if( h->param.b_interlaced ) { x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" ); return -1; } #endif #define MAX_RESOLUTION 16384 if( h->param.i_width <= 0 || h->param.i_height <= 0 || h->param.i_width > MAX_RESOLUTION || h->param.i_height > MAX_RESOLUTION ) { x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n", h->param.i_width, h->param.i_height ); return -1; } int i_csp = h->param.i_csp & X264_CSP_MASK; #if X264_CHROMA_FORMAT if( CHROMA_FORMAT != CHROMA_400 && i_csp == X264_CSP_I400 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:0:0 support\n" ); return -1; } else if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" ); return -1; } else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" ); return -1; } else if( CHROMA_FORMAT != CHROMA_444 && i_csp >= X264_CSP_I444 && i_csp <= X264_CSP_RGB ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:4:4 support\n" ); return -1; } #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { x264_log( h, X264_LOG_ERROR, "invalid CSP (only I400/I420/YV12/NV12/NV21/I422/YV16/NV16/YUYV/UYVY/" "I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } int w_mod = 1; int h_mod = 1 << (PARAM_INTERLACED || h->param.b_fake_interlaced); if( i_csp == X264_CSP_I400 ) { h->param.analyse.i_chroma_qp_offset = 0; h->param.analyse.b_chroma_me = 0; h->param.vui.i_colmatrix = 2; /* undefined */ } else if( i_csp < X264_CSP_I444 ) { w_mod = 2; if( i_csp < X264_CSP_I422 ) h_mod *= 2; } if( h->param.i_width % w_mod ) { x264_log( h, X264_LOG_ERROR, "width not divisible by %d (%dx%d)\n", w_mod, h->param.i_width, h->param.i_height ); return -1; } if( h->param.i_height % h_mod ) { x264_log( h, X264_LOG_ERROR, "height not divisible by %d (%dx%d)\n", h_mod, h->param.i_width, h->param.i_height ); return -1; } if( h->param.crop_rect.i_left < 0 || h->param.crop_rect.i_left >= h->param.i_width || h->param.crop_rect.i_right < 0 || h->param.crop_rect.i_right >= h->param.i_width || h->param.crop_rect.i_top < 0 || h->param.crop_rect.i_top >= h->param.i_height || h->param.crop_rect.i_bottom < 0 || h->param.crop_rect.i_bottom >= h->param.i_height || h->param.crop_rect.i_left + h->param.crop_rect.i_right >= h->param.i_width || h->param.crop_rect.i_top + h->param.crop_rect.i_bottom >= h->param.i_height ) { x264_log( h, X264_LOG_ERROR, "invalid crop-rect %d,%d,%d,%d\n", h->param.crop_rect.i_left, h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom ); return -1; } if( h->param.crop_rect.i_left % w_mod || h->param.crop_rect.i_right % w_mod || h->param.crop_rect.i_top % h_mod || h->param.crop_rect.i_bottom % h_mod ) { x264_log( h, X264_LOG_ERROR, "crop-rect %d,%d,%d,%d not 
divisible by %dx%d\n", h->param.crop_rect.i_left, h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom, w_mod, h_mod ); return -1; } if( h->param.vui.i_sar_width <= 0 || h->param.vui.i_sar_height <= 0 ) { h->param.vui.i_sar_width = 0; h->param.vui.i_sar_height = 0; } if( h->param.i_threads == X264_THREADS_AUTO ) { h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; /* Avoid too many threads as they don't improve performance and * complicate VBV. Capped at an arbitrary 2 rows per thread. */ int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 2 ); h->param.i_threads = X264_MIN( h->param.i_threads, max_threads ); } int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { #if !HAVE_THREAD x264_log( h, X264_LOG_WARNING, "not compiled with thread support!\n"); h->param.i_threads = 1; #endif /* Avoid absurdly small thread slices as they can reduce performance * and VBV compliance. Capped at an arbitrary 4 rows per thread. */ if( h->param.b_sliced_threads ) h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads ); } h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); if( h->param.i_threads == 1 ) { h->param.b_sliced_threads = 0; h->param.i_lookahead_threads = 1; } h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads; if( h->i_thread_frames > 1 ) h->param.nalu_process = NULL; if( h->param.b_opencl ) { #if !HAVE_OPENCL x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" ); h->param.b_opencl = 0; #elif BIT_DEPTH > 8 x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" ); h->param.b_opencl = 0; #else if( h->param.i_width < 32 || h->param.i_height < 32 ) { x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" ); h->param.b_opencl = 0; } #endif if( h->param.opencl_device_id && h->param.i_opencl_device ) { x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" ); h->param.i_opencl_device = 0; } } h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE ); if( h->param.i_keyint_max == 1 ) { h->param.b_intra_refresh = 0; h->param.analyse.i_weighted_pred = 0; h->param.i_frame_reference = 1; h->param.i_dpb_size = 1; } if( h->param.i_frame_packing < -1 || h->param.i_frame_packing > 7 ) { x264_log( h, X264_LOG_WARNING, "ignoring unknown frame packing value\n" ); h->param.i_frame_packing = -1; } if( h->param.i_frame_packing == 7 && ((h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right) % 3 || (h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom) % 3) ) { x264_log( h, X264_LOG_ERROR, "cropped resolution %dx%d not compatible with tile format frame packing\n", h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right, h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom ); return -1; } if( h->param.mastering_display.b_mastering_display ) { if( h->param.mastering_display.i_green_x > UINT16_MAX || h->param.mastering_display.i_green_x < 0 || h->param.mastering_display.i_green_y > UINT16_MAX || h->param.mastering_display.i_green_y < 0 || h->param.mastering_display.i_blue_x > UINT16_MAX || h->param.mastering_display.i_blue_x < 0 || h->param.mastering_display.i_blue_y > UINT16_MAX || h->param.mastering_display.i_blue_y < 0 || h->param.mastering_display.i_red_x > UINT16_MAX || 
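/* Mastering-display SEI units (SMPTE ST 2086): chromaticity coordinates are 16-bit values
 * in increments of 0.00002, luminance is a 32-bit value in units of 0.0001 cd/m2, hence
 * the ranges enforced here. */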
h->param.mastering_display.i_red_x < 0 || h->param.mastering_display.i_red_y > UINT16_MAX || h->param.mastering_display.i_red_y < 0 || h->param.mastering_display.i_white_x > UINT16_MAX || h->param.mastering_display.i_white_x < 0 || h->param.mastering_display.i_white_y > UINT16_MAX || h->param.mastering_display.i_white_y < 0 ) { x264_log( h, X264_LOG_ERROR, "mastering display xy coordinates out of range [0,%u]\n", UINT16_MAX ); return -1; } if( h->param.mastering_display.i_display_max > UINT32_MAX || h->param.mastering_display.i_display_max < 0 || h->param.mastering_display.i_display_min > UINT32_MAX || h->param.mastering_display.i_display_min < 0 ) { x264_log( h, X264_LOG_ERROR, "mastering display brightness out of range [0,%u]\n", UINT32_MAX ); return -1; } if( h->param.mastering_display.i_display_min == 50000 && h->param.mastering_display.i_display_max == 50000 ) { x264_log( h, X264_LOG_ERROR, "mastering display min and max brightness cannot both be 50000\n" ); return -1; } } if( h->param.content_light_level.b_cll && (h->param.content_light_level.i_max_cll > UINT16_MAX || h->param.content_light_level.i_max_cll < 0 || h->param.content_light_level.i_max_fall > UINT16_MAX || h->param.content_light_level.i_max_fall < 0) ) { x264_log( h, X264_LOG_ERROR, "content light levels out of range [0,%u]\n", UINT16_MAX ); return -1; } /* Detect default ffmpeg settings and terminate with an error. */ if( b_open ) { int score = 0; score += h->param.analyse.i_me_range == 0; score += h->param.rc.i_qp_step == 3; score += h->param.i_keyint_max == 12; score += h->param.rc.i_qp_min == 2; score += h->param.rc.i_qp_max == 31; score += h->param.rc.f_qcompress == 0.5; score += fabs(h->param.rc.f_ip_factor - 1.25) < 0.01; score += fabs(h->param.rc.f_pb_factor - 1.25) < 0.01; score += h->param.analyse.inter == 0 && h->param.analyse.i_subpel_refine == 8; if( score >= 5 ) { x264_log( h, X264_LOG_ERROR, "broken ffmpeg default settings detected\n" ); x264_log( h, X264_LOG_ERROR, "use an encoding preset (e.g. -vpre medium)\n" ); x264_log( h, X264_LOG_ERROR, "preset usage: -vpre -vpre \n" ); x264_log( h, X264_LOG_ERROR, "speed presets are listed in x264 --help\n" ); x264_log( h, X264_LOG_ERROR, "profile is optional; x264 defaults to high\n" ); return -1; } } if( h->param.rc.i_rc_method < 0 || h->param.rc.i_rc_method > 2 ) { x264_log( h, X264_LOG_ERROR, "no ratecontrol method specified\n" ); return -1; } if( PARAM_INTERLACED ) h->param.b_pic_struct = 1; if( h->param.i_avcintra_class ) { if( BIT_DEPTH != 10 ) { x264_log( h, X264_LOG_ERROR, "%2d-bit AVC-Intra is not widely compatible\n", BIT_DEPTH ); x264_log( h, X264_LOG_ERROR, "10-bit x264 is required to encode AVC-Intra\n" ); return -1; } int type = h->param.i_avcintra_class == 480 ? 4 : h->param.i_avcintra_class == 300 ? 3 : h->param.i_avcintra_class == 200 ? 2 : h->param.i_avcintra_class == 100 ? 1 : h->param.i_avcintra_class == 50 ? 
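/* Map the AVC-Intra class to a row of avcintra_lut below:
 * 50 -> 0, 100 -> 1, 200 -> 2, 300 -> 3, 480 -> 4; anything else is rejected. */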
0 : -1; if( type < 0 ) { x264_log( h, X264_LOG_ERROR, "Invalid AVC-Intra class\n" ); return -1; } else if( type > 2 && h->param.i_avcintra_flavor != X264_AVCINTRA_FLAVOR_SONY ) { x264_log( h, X264_LOG_ERROR, "AVC-Intra %d only supported by Sony XAVC flavor\n", h->param.i_avcintra_class ); return -1; } /* [50/100/200/300/480][res][fps] */ static const struct { uint16_t fps_num; uint16_t fps_den; uint8_t interlaced; uint16_t frame_size; const uint8_t *cqm_4iy; const uint8_t *cqm_4ic; const uint8_t *cqm_8iy; } avcintra_lut[5][2][7] = { {{{ 60000, 1001, 0, 912, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 50, 1, 0, 1100, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 30000, 1001, 0, 912, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 25, 1, 0, 1100, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 24000, 1001, 0, 912, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }}, {{ 30000, 1001, 1, 1820, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_1080i_8iy }, { 25, 1, 1, 2196, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_1080i_8iy }, { 60000, 1001, 0, 1820, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 30000, 1001, 0, 1820, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 50, 1, 0, 2196, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 25, 1, 0, 2196, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }, { 24000, 1001, 0, 1820, x264_cqm_jvt4i, x264_cqm_avci50_4ic, x264_cqm_avci50_p_8iy }}}, {{{ 60000, 1001, 0, 1848, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }, { 50, 1, 0, 2224, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }, { 30000, 1001, 0, 1848, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }, { 25, 1, 0, 2224, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }, { 24000, 1001, 0, 1848, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }}, {{ 30000, 1001, 1, 3692, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy }, { 25, 1, 1, 4444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy }, { 60000, 1001, 0, 3692, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 30000, 1001, 0, 3692, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 50, 1, 0, 4444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 25, 1, 0, 4444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 24000, 1001, 0, 3692, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }}}, {{{ 60000, 1001, 0, 3724, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }, { 50, 1, 0, 4472, x264_cqm_jvt4i, x264_cqm_avci100_720p_4ic, x264_cqm_avci100_720p_8iy }}, {{ 30000, 1001, 1, 7444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy }, { 25, 1, 1, 8940, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080i_8iy }, { 60000, 1001, 0, 7444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 30000, 1001, 0, 7444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 50, 1, 0, 8940, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 25, 1, 0, 8940, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }, { 24000, 1001, 0, 7444, x264_cqm_jvt4i, x264_cqm_avci100_1080_4ic, x264_cqm_avci100_1080p_8iy }}}, {{{ 60000, 
1001, 0, 9844, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 50, 1, 0, 9844, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 30000, 1001, 0, 9844, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 25, 1, 0, 9844, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 24000, 1001, 0, 9844, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }}}, {{{ 60000, 1001, 0, 15700, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 50, 1, 0, 15700, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 30000, 1001, 0, 15700, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 25, 1, 0, 15700, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }, { 24000, 1001, 0, 15700, x264_cqm_avci300_2160p_4iy, x264_cqm_avci300_2160p_4ic, x264_cqm_avci300_2160p_8iy }}} }; int res = -1; if( i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 && !type ) { if( h->param.i_width == 1440 && h->param.i_height == 1080 ) res = 1; else if( h->param.i_width == 960 && h->param.i_height == 720 ) res = 0; } else if( i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 && type ) { if( type < 3 ) { if( h->param.i_width == 1920 && h->param.i_height == 1080 ) res = 1; else if( h->param.i_width == 2048 && h->param.i_height == 1080 ) res = 1; else if( h->param.i_width == 1280 && h->param.i_height == 720 ) res = 0; } else { if( h->param.i_width == 3840 && h->param.i_height == 2160 ) res = 0; else if( h->param.i_width == 4096 && h->param.i_height == 2160 ) res = 0; } } else { x264_log( h, X264_LOG_ERROR, "Invalid colorspace for AVC-Intra %d\n", h->param.i_avcintra_class ); return -1; } if( res < 0 ) { x264_log( h, X264_LOG_ERROR, "Resolution %dx%d invalid for AVC-Intra %d\n", h->param.i_width, h->param.i_height, h->param.i_avcintra_class ); return -1; } if( h->param.nalu_process ) { x264_log( h, X264_LOG_ERROR, "nalu_process is not supported in AVC-Intra mode\n" ); return -1; } if( !h->param.b_repeat_headers ) { x264_log( h, X264_LOG_ERROR, "Separate headers not supported in AVC-Intra mode\n" ); return -1; } int i; uint32_t fps_num = h->param.i_fps_num, fps_den = h->param.i_fps_den; x264_reduce_fraction( &fps_num, &fps_den ); for( i = 0; i < 7; i++ ) { if( avcintra_lut[type][res][i].fps_num == fps_num && avcintra_lut[type][res][i].fps_den == fps_den && avcintra_lut[type][res][i].interlaced == PARAM_INTERLACED ) { break; } } if( i == 7 ) { x264_log( h, X264_LOG_ERROR, "FPS %d/%d%c not compatible with AVC-Intra %d\n", h->param.i_fps_num, h->param.i_fps_den, PARAM_INTERLACED ? 'i' : 'p', h->param.i_avcintra_class ); return -1; } h->param.i_keyint_max = 1; h->param.b_intra_refresh = 0; h->param.analyse.i_weighted_pred = 0; h->param.i_frame_reference = 1; h->param.i_dpb_size = 1; h->param.b_bluray_compat = 0; h->param.b_vfr_input = 0; h->param.b_aud = 1; h->param.vui.i_chroma_loc = 0; h->param.i_nal_hrd = X264_NAL_HRD_NONE; h->param.b_deblocking_filter = 0; h->param.b_stitchable = 1; h->param.b_pic_struct = 0; h->param.analyse.b_transform_8x8 = 1; h->param.analyse.intra = X264_ANALYSE_I8x8; h->param.analyse.i_chroma_qp_offset = type > 2 ? -4 : res && type ? 
3 : 4; h->param.b_cabac = !type; h->param.rc.i_vbv_buffer_size = avcintra_lut[type][res][i].frame_size; h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate = h->param.rc.i_vbv_buffer_size * fps_num / fps_den; h->param.rc.i_rc_method = X264_RC_ABR; h->param.rc.f_vbv_buffer_init = 1.0; h->param.rc.b_filler = 1; h->param.i_cqm_preset = X264_CQM_CUSTOM; memcpy( h->param.cqm_4iy, avcintra_lut[type][res][i].cqm_4iy, sizeof(h->param.cqm_4iy) ); memcpy( h->param.cqm_4ic, avcintra_lut[type][res][i].cqm_4ic, sizeof(h->param.cqm_4ic) ); memcpy( h->param.cqm_8iy, avcintra_lut[type][res][i].cqm_8iy, sizeof(h->param.cqm_8iy) ); /* Sony XAVC flavor much more simple */ if( h->param.i_avcintra_flavor == X264_AVCINTRA_FLAVOR_SONY ) { h->param.i_slice_count = 8; if( h->param.b_sliced_threads ) h->param.i_threads = h->param.i_slice_count; /* Sony XAVC unlike AVC-Intra doesn't seem to have a QP floor */ } else { /* Need exactly 10 slices of equal MB count... why? $deity knows... */ h->param.i_slice_max_mbs = ((h->param.i_width + 15) / 16) * ((h->param.i_height + 15) / 16) / 10; h->param.i_slice_max_size = 0; /* The slice structure only allows a maximum of 2 threads for 1080i/p * and 1 or 5 threads for 720p */ if( h->param.b_sliced_threads ) { if( res ) h->param.i_threads = X264_MIN( 2, h->param.i_threads ); else { h->param.i_threads = X264_MIN( 5, h->param.i_threads ); if( h->param.i_threads < 5 ) h->param.i_threads = 1; } } /* Official encoder doesn't appear to go under 13 * and Avid cannot handle negative QPs */ h->param.rc.i_qp_min = X264_MAX( h->param.rc.i_qp_min, QP_BD_OFFSET + 1 ); } if( type ) h->param.vui.i_sar_width = h->param.vui.i_sar_height = 1; else { h->param.vui.i_sar_width = 4; h->param.vui.i_sar_height = 3; } } h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, -QP_BD_OFFSET, 51 ); h->param.rc.f_rf_constant_max = x264_clip3f( h->param.rc.f_rf_constant_max, -QP_BD_OFFSET, 51 ); h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, -1, QP_MAX ); h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 0, 11 ); h->param.rc.f_ip_factor = x264_clip3f( h->param.rc.f_ip_factor, 0.01, 10.0 ); h->param.rc.f_pb_factor = x264_clip3f( h->param.rc.f_pb_factor, 0.01, 10.0 ); if( h->param.rc.i_rc_method == X264_RC_CRF ) { h->param.rc.i_qp_constant = h->param.rc.f_rf_constant + QP_BD_OFFSET; h->param.rc.i_bitrate = 0; } if( b_open && (h->param.rc.i_rc_method == X264_RC_CQP || h->param.rc.i_rc_method == X264_RC_CRF) && h->param.rc.i_qp_constant == 0 ) { h->mb.b_lossless = 1; h->param.i_cqm_preset = X264_CQM_FLAT; h->param.psz_cqm_file = NULL; h->param.rc.i_rc_method = X264_RC_CQP; h->param.rc.f_ip_factor = 1; h->param.rc.f_pb_factor = 1; h->param.analyse.b_psnr = 0; h->param.analyse.b_ssim = 0; h->param.analyse.i_chroma_qp_offset = 0; h->param.analyse.i_trellis = 0; h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; h->param.analyse.b_psy = 0; h->param.i_bframe = 0; /* 8x8dct is not useful without RD in CAVLC lossless */ if( !h->param.b_cabac && h->param.analyse.i_subpel_refine < 6 ) h->param.analyse.b_transform_8x8 = 0; } if( h->param.rc.i_rc_method == X264_RC_CQP ) { float qp_p = h->param.rc.i_qp_constant; float qp_i = qp_p - 6*log2f( h->param.rc.f_ip_factor ); float qp_b = qp_p + 6*log2f( h->param.rc.f_pb_factor ); if( qp_p < 0 ) { x264_log( h, X264_LOG_ERROR, "qp not specified\n" ); return -1; } h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, QP_MAX ); h->param.rc.i_qp_max = x264_clip3( 
(int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX ); h->param.rc.i_aq_mode = 0; h->param.rc.b_mb_tree = 0; h->param.rc.i_bitrate = 0; } h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX ); h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max ); h->param.rc.i_qp_step = x264_clip3( h->param.rc.i_qp_step, 2, QP_MAX ); h->param.rc.i_bitrate = x264_clip3( h->param.rc.i_bitrate, 0, 2000000 ); if( h->param.rc.i_rc_method == X264_RC_ABR && !h->param.rc.i_bitrate ) { x264_log( h, X264_LOG_ERROR, "bitrate not specified\n" ); return -1; } h->param.rc.i_vbv_buffer_size = x264_clip3( h->param.rc.i_vbv_buffer_size, 0, 2000000 ); h->param.rc.i_vbv_max_bitrate = x264_clip3( h->param.rc.i_vbv_max_bitrate, 0, 2000000 ); h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init, 0, 2000000 ); if( h->param.rc.i_vbv_buffer_size ) { if( h->param.rc.i_rc_method == X264_RC_CQP ) { x264_log( h, X264_LOG_WARNING, "VBV is incompatible with constant QP, ignored.\n" ); h->param.rc.i_vbv_max_bitrate = 0; h->param.rc.i_vbv_buffer_size = 0; } else if( h->param.rc.i_vbv_max_bitrate == 0 ) { if( h->param.rc.i_rc_method == X264_RC_ABR ) { x264_log( h, X264_LOG_WARNING, "VBV maxrate unspecified, assuming CBR\n" ); h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate; } else { x264_log( h, X264_LOG_WARNING, "VBV bufsize set but maxrate unspecified, ignored\n" ); h->param.rc.i_vbv_buffer_size = 0; } } else if( h->param.rc.i_vbv_max_bitrate < h->param.rc.i_bitrate && h->param.rc.i_rc_method == X264_RC_ABR ) { x264_log( h, X264_LOG_WARNING, "max bitrate less than average bitrate, assuming CBR\n" ); h->param.rc.i_bitrate = h->param.rc.i_vbv_max_bitrate; } } else if( h->param.rc.i_vbv_max_bitrate ) { x264_log( h, X264_LOG_WARNING, "VBV maxrate specified, but no bufsize, ignored\n" ); h->param.rc.i_vbv_max_bitrate = 0; } h->param.i_slice_max_size = X264_MAX( h->param.i_slice_max_size, 0 ); h->param.i_slice_max_mbs = X264_MAX( h->param.i_slice_max_mbs, 0 ); h->param.i_slice_min_mbs = X264_MAX( h->param.i_slice_min_mbs, 0 ); if( h->param.i_slice_max_mbs ) h->param.i_slice_min_mbs = X264_MIN( h->param.i_slice_min_mbs, h->param.i_slice_max_mbs/2 ); else if( !h->param.i_slice_max_size ) h->param.i_slice_min_mbs = 0; if( PARAM_INTERLACED && h->param.i_slice_min_mbs ) { x264_log( h, X264_LOG_WARNING, "interlace + slice-min-mbs is not implemented\n" ); h->param.i_slice_min_mbs = 0; } int mb_width = (h->param.i_width+15)/16; if( h->param.i_slice_min_mbs > mb_width ) { x264_log( h, X264_LOG_WARNING, "slice-min-mbs > row mb size (%d) not implemented\n", mb_width ); h->param.i_slice_min_mbs = mb_width; } int max_slices = (h->param.i_height+((16<<PARAM_INTERLACED)-1))/(16<<PARAM_INTERLACED); if( h->param.b_sliced_threads ) h->param.i_slice_count = x264_clip3( h->param.i_threads, 0, max_slices ); else { h->param.i_slice_count = x264_clip3( h->param.i_slice_count, 0, max_slices ); if( h->param.i_slice_max_mbs || h->param.i_slice_max_size ) h->param.i_slice_count = 0; } if( h->param.i_slice_count_max > 0 ) h->param.i_slice_count_max = X264_MAX( h->param.i_slice_count, h->param.i_slice_count_max ); if( h->param.b_bluray_compat ) { h->param.i_bframe_pyramid = X264_MIN( X264_B_PYRAMID_STRICT, h->param.i_bframe_pyramid ); h->param.i_bframe = X264_MIN( h->param.i_bframe, 3 ); h->param.b_aud = 1; h->param.i_nal_hrd = X264_MAX( h->param.i_nal_hrd, X264_NAL_HRD_VBR ); h->param.i_slice_max_size = 0; h->param.i_slice_max_mbs = 0; h->param.b_intra_refresh = 0; h->param.i_frame_reference = X264_MIN( h->param.i_frame_reference, 6 );
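/* Presumably these caps mirror the Blu-ray AVC restrictions: BD players only
 * guarantee a small decoded picture buffer, so the reference count above and
 * the DPB size below are both clamped to 6. */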
h->param.i_dpb_size = X264_MIN( h->param.i_dpb_size, 6 ); /* Don't use I-frames, because Blu-ray treats them the same as IDR. */ h->param.i_keyint_min = 1; /* Due to the proliferation of broken players that don't handle dupes properly. */ h->param.analyse.i_weighted_pred = X264_MIN( h->param.analyse.i_weighted_pred, X264_WEIGHTP_SIMPLE ); if( h->param.b_fake_interlaced ) h->param.b_pic_struct = 1; } h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, X264_REF_MAX ); h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, X264_REF_MAX ); if( h->param.i_scenecut_threshold < 0 ) h->param.i_scenecut_threshold = 0; h->param.analyse.i_direct_mv_pred = x264_clip3( h->param.analyse.i_direct_mv_pred, X264_DIRECT_PRED_NONE, X264_DIRECT_PRED_AUTO ); if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL ) { x264_log( h, X264_LOG_WARNING, "subme=0 + direct=temporal is not supported\n" ); h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL; } h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_MIN( X264_BFRAME_MAX, h->param.i_keyint_max-1 ) ); h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 ); if( h->param.i_bframe <= 1 ) h->param.i_bframe_pyramid = X264_B_PYRAMID_NONE; h->param.i_bframe_pyramid = x264_clip3( h->param.i_bframe_pyramid, X264_B_PYRAMID_NONE, X264_B_PYRAMID_NORMAL ); h->param.i_bframe_adaptive = x264_clip3( h->param.i_bframe_adaptive, X264_B_ADAPT_NONE, X264_B_ADAPT_TRELLIS ); if( !h->param.i_bframe ) { h->param.i_bframe_adaptive = X264_B_ADAPT_NONE; h->param.analyse.i_direct_mv_pred = 0; h->param.analyse.b_weighted_bipred = 0; h->param.b_open_gop = 0; } if( h->param.b_intra_refresh && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL ) { x264_log( h, X264_LOG_WARNING, "b-pyramid normal + intra-refresh is not supported\n" ); h->param.i_bframe_pyramid = X264_B_PYRAMID_STRICT; } if( h->param.b_intra_refresh && (h->param.i_frame_reference > 1 || h->param.i_dpb_size > 1) ) { x264_log( h, X264_LOG_WARNING, "ref > 1 + intra-refresh is not supported\n" ); h->param.i_frame_reference = 1; h->param.i_dpb_size = 1; } if( h->param.b_intra_refresh && h->param.b_open_gop ) { x264_log( h, X264_LOG_WARNING, "intra-refresh is not compatible with open-gop\n" ); h->param.b_open_gop = 0; } if( !h->param.i_fps_num || !h->param.i_fps_den ) { h->param.i_fps_num = 25; h->param.i_fps_den = 1; } float fps = (float)h->param.i_fps_num / h->param.i_fps_den; if( h->param.i_keyint_min == X264_KEYINT_MIN_AUTO ) h->param.i_keyint_min = X264_MIN( h->param.i_keyint_max / 10, (int)fps ); h->param.i_keyint_min = x264_clip3( h->param.i_keyint_min, 1, h->param.i_keyint_max/2+1 ); h->param.rc.i_lookahead = x264_clip3( h->param.rc.i_lookahead, 0, X264_LOOKAHEAD_MAX ); { int maxrate = X264_MAX( h->param.rc.i_vbv_max_bitrate, h->param.rc.i_bitrate ); float bufsize = maxrate ? 
(float)h->param.rc.i_vbv_buffer_size / maxrate : 0; h->param.rc.i_lookahead = X264_MIN( h->param.rc.i_lookahead, X264_MAX( h->param.i_keyint_max, bufsize*fps ) ); } if( !h->param.i_timebase_num || !h->param.i_timebase_den || !(h->param.b_vfr_input || h->param.b_pulldown) ) { h->param.i_timebase_num = h->param.i_fps_den; h->param.i_timebase_den = h->param.i_fps_num; } h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 ); if( h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 ) h->param.rc.b_mb_tree = 0; if( (!h->param.b_intra_refresh && h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE) && !h->param.rc.i_lookahead && h->param.rc.b_mb_tree ) { x264_log( h, X264_LOG_WARNING, "lookaheadless mb-tree requires intra refresh or infinite keyint\n" ); h->param.rc.b_mb_tree = 0; } if( b_open && h->param.rc.b_stat_read ) h->param.rc.i_lookahead = 0; #if HAVE_THREAD if( h->param.i_sync_lookahead < 0 ) h->param.i_sync_lookahead = h->param.i_bframe + 1; h->param.i_sync_lookahead = X264_MIN( h->param.i_sync_lookahead, X264_LOOKAHEAD_MAX ); if( h->param.rc.b_stat_read || h->i_thread_frames == 1 ) h->param.i_sync_lookahead = 0; #else h->param.i_sync_lookahead = 0; #endif h->param.i_deblocking_filter_alphac0 = x264_clip3( h->param.i_deblocking_filter_alphac0, -6, 6 ); h->param.i_deblocking_filter_beta = x264_clip3( h->param.i_deblocking_filter_beta, -6, 6 ); h->param.analyse.i_luma_deadzone[0] = x264_clip3( h->param.analyse.i_luma_deadzone[0], 0, 32 ); h->param.analyse.i_luma_deadzone[1] = x264_clip3( h->param.analyse.i_luma_deadzone[1], 0, 32 ); h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, 0, 2 ); if( h->param.i_cqm_preset < X264_CQM_FLAT || h->param.i_cqm_preset > X264_CQM_CUSTOM ) h->param.i_cqm_preset = X264_CQM_FLAT; if( h->param.analyse.i_me_method < X264_ME_DIA || h->param.analyse.i_me_method > X264_ME_TESA ) h->param.analyse.i_me_method = X264_ME_HEX; h->param.analyse.i_me_range = x264_clip3( h->param.analyse.i_me_range, 4, 1024 ); if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX ) h->param.analyse.i_me_range = 16; if( h->param.analyse.i_me_method == X264_ME_TESA && (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) ) h->param.analyse.i_me_method = X264_ME_ESA; h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1; h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16| X264_ANALYSE_I4x4|X264_ANALYSE_I8x8; h->param.analyse.intra &= X264_ANALYSE_I4x4|X264_ANALYSE_I8x8; if( !(h->param.analyse.inter & X264_ANALYSE_PSUB16x16) ) h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8; if( !h->param.analyse.b_transform_8x8 ) { h->param.analyse.inter &= ~X264_ANALYSE_I8x8; h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 ); h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 3 ); h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 ); if( h->param.rc.f_aq_strength == 0 ) h->param.rc.i_aq_mode = 0; if( h->param.i_log_level < X264_LOG_INFO ) { h->param.analyse.b_psnr = 0; h->param.analyse.b_ssim = 0; } /* Warn users trying to measure PSNR/SSIM with psy opts on. */ if( b_open && (h->param.analyse.b_psnr || h->param.analyse.b_ssim) ) { char *s = NULL; if( h->param.analyse.b_psy ) { s = h->param.analyse.b_psnr ? 
"psnr" : "ssim"; x264_log( h, X264_LOG_WARNING, "--%s used with psy on: results will be invalid!\n", s ); } else if( !h->param.rc.i_aq_mode && h->param.analyse.b_ssim ) { x264_log( h, X264_LOG_WARNING, "--ssim used with AQ off: results will be invalid!\n" ); s = "ssim"; } else if( h->param.rc.i_aq_mode && h->param.analyse.b_psnr ) { x264_log( h, X264_LOG_WARNING, "--psnr used with AQ on: results will be invalid!\n" ); s = "psnr"; } if( s ) x264_log( h, X264_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s ); } if( !h->param.analyse.b_psy ) { h->param.analyse.f_psy_rd = 0; h->param.analyse.f_psy_trellis = 0; } h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 ); h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 ); h->mb.i_psy_rd = h->param.analyse.i_subpel_refine >= 6 ? FIX8( h->param.analyse.f_psy_rd ) : 0; h->mb.i_psy_trellis = h->param.analyse.i_trellis ? FIX8( h->param.analyse.f_psy_trellis / 4 ) : 0; h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -32, 32); /* In 4:4:4 mode, chroma gets twice as much resolution, so we can halve its quality. */ if( b_open && i_csp >= X264_CSP_I444 && i_csp < X264_CSP_BGR && h->param.analyse.b_psy ) h->param.analyse.i_chroma_qp_offset += 6; /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality */ /* so we lower the chroma QP offset to compensate */ if( b_open && h->mb.i_psy_rd && !h->param.i_avcintra_class ) h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2; /* Psy trellis has a similar effect. */ if( b_open && h->mb.i_psy_trellis && !h->param.i_avcintra_class ) h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 1 : 2; h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12); /* MB-tree requires AQ to be on, even if the strength is zero. 
*/ if( !h->param.rc.i_aq_mode && h->param.rc.b_mb_tree ) { h->param.rc.i_aq_mode = 1; h->param.rc.f_aq_strength = 0; } h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 ); if( h->param.analyse.i_subpel_refine >= 10 && (h->param.analyse.i_trellis != 2 || !h->param.rc.i_aq_mode) ) h->param.analyse.i_subpel_refine = 9; if( b_open ) { const x264_level_t *l = x264_levels; if( h->param.i_level_idc < 0 ) { int maxrate_bak = h->param.rc.i_vbv_max_bitrate; if( h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.i_vbv_buffer_size <= 0 ) h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate * 2; x264_sps_init( h->sps, h->param.i_sps_id, &h->param ); do h->param.i_level_idc = l->level_idc; while( l[1].level_idc && x264_validate_levels( h, 0 ) && l++ ); h->param.rc.i_vbv_max_bitrate = maxrate_bak; } else { while( l->level_idc && l->level_idc != h->param.i_level_idc ) l++; if( l->level_idc == 0 ) { x264_log( h, X264_LOG_ERROR, "invalid level_idc: %d\n", h->param.i_level_idc ); return -1; } } if( h->param.analyse.i_mv_range <= 0 ) h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED; else h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 8192 >> PARAM_INTERLACED); } h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); if( h->param.i_lookahead_threads == X264_THREADS_AUTO ) { if( h->param.b_sliced_threads ) h->param.i_lookahead_threads = h->param.i_threads; else { /* If we're using much slower lookahead settings than encoding settings, it helps a lot to use * more lookahead threads. This typically happens in the first pass of a two-pass encode, so * try to guess at this sort of case. * * Tuned by a little bit of real encoding with the various presets. */ int badapt = h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS; int subme = X264_MIN( h->param.analyse.i_subpel_refine / 3, 3 ) + (h->param.analyse.i_subpel_refine > 1); int bframes = X264_MIN( (h->param.i_bframe - 1) / 3, 3 ); /* [b-adapt 0/1 vs 2][quantized subme][quantized bframes] */ static const uint8_t lookahead_thread_div[2][5][4] = {{{6,6,6,6}, {3,3,3,3}, {4,4,4,4}, {6,6,6,6}, {12,12,12,12}}, {{3,2,1,1}, {2,1,1,1}, {4,3,2,1}, {6,4,3,2}, {12, 9, 6, 4}}}; h->param.i_lookahead_threads = h->param.i_threads / lookahead_thread_div[badapt][subme][bframes]; /* Since too many lookahead threads significantly degrades lookahead accuracy, limit auto * lookahead threads to about 8 macroblock rows high each at worst. This number is chosen * pretty much arbitrarily. 
*/ h->param.i_lookahead_threads = X264_MIN( h->param.i_lookahead_threads, h->param.i_height / 128 ); } } h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) ); if( PARAM_INTERLACED ) { if( h->param.analyse.i_me_method >= X264_ME_ESA ) { x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" ); h->param.analyse.i_me_method = X264_ME_UMH; } if( h->param.analyse.i_weighted_pred > 0 ) { x264_log( h, X264_LOG_WARNING, "interlace + weightp is not implemented\n" ); h->param.analyse.i_weighted_pred = X264_WEIGHTP_NONE; } } if( !h->param.analyse.i_weighted_pred && h->param.rc.b_mb_tree && h->param.analyse.b_psy ) h->param.analyse.i_weighted_pred = X264_WEIGHTP_FAKE; if( h->i_thread_frames > 1 ) { int r = h->param.analyse.i_mv_range_thread; int r2; if( r <= 0 ) { // half of the available space is reserved and divided evenly among the threads, // the rest is allocated to whichever thread is far enough ahead to use it. // reserving more space increases quality for some videos, but costs more time // in thread synchronization. int max_range = (h->param.i_height + X264_THREAD_HEIGHT) / h->i_thread_frames - X264_THREAD_HEIGHT; r = max_range / 2; } r = X264_MAX( r, h->param.analyse.i_me_range ); r = X264_MIN( r, h->param.analyse.i_mv_range ); // round up to use the whole mb row r2 = (r & ~15) + ((-X264_THREAD_HEIGHT) & 15); if( r2 < r ) r2 += 16; x264_log( h, X264_LOG_DEBUG, "using mv_range_thread = %d\n", r2 ); h->param.analyse.i_mv_range_thread = r2; } if( h->param.rc.f_rate_tolerance < 0 ) h->param.rc.f_rate_tolerance = 0; if( h->param.rc.f_qblur < 0 ) h->param.rc.f_qblur = 0; if( h->param.rc.f_complexity_blur < 0 ) h->param.rc.f_complexity_blur = 0; h->param.i_sps_id &= 31; h->param.i_nal_hrd = x264_clip3( h->param.i_nal_hrd, X264_NAL_HRD_NONE, X264_NAL_HRD_CBR ); if( h->param.i_nal_hrd && !h->param.rc.i_vbv_buffer_size ) { x264_log( h, X264_LOG_WARNING, "NAL HRD parameters require VBV parameters\n" ); h->param.i_nal_hrd = X264_NAL_HRD_NONE; } if( h->param.i_nal_hrd == X264_NAL_HRD_CBR && (h->param.rc.i_bitrate != h->param.rc.i_vbv_max_bitrate || !h->param.rc.i_vbv_max_bitrate) ) { x264_log( h, X264_LOG_WARNING, "CBR HRD requires constant bitrate\n" ); h->param.i_nal_hrd = X264_NAL_HRD_VBR; } if( h->param.i_nal_hrd == X264_NAL_HRD_CBR ) h->param.rc.b_filler = 1; /* ensure the booleans are 0 or 1 so they can be used in math */ #define BOOLIFY(x) h->param.x = !!h->param.x BOOLIFY( b_cabac ); BOOLIFY( b_constrained_intra ); BOOLIFY( b_deblocking_filter ); BOOLIFY( b_deterministic ); BOOLIFY( b_sliced_threads ); BOOLIFY( b_interlaced ); BOOLIFY( b_intra_refresh ); BOOLIFY( b_aud ); BOOLIFY( b_repeat_headers ); BOOLIFY( b_annexb ); BOOLIFY( b_vfr_input ); BOOLIFY( b_pulldown ); BOOLIFY( b_tff ); BOOLIFY( b_pic_struct ); BOOLIFY( b_fake_interlaced ); BOOLIFY( b_open_gop ); BOOLIFY( b_bluray_compat ); BOOLIFY( b_stitchable ); BOOLIFY( b_full_recon ); BOOLIFY( b_opencl ); BOOLIFY( analyse.b_transform_8x8 ); BOOLIFY( analyse.b_weighted_bipred ); BOOLIFY( analyse.b_chroma_me ); BOOLIFY( analyse.b_mixed_references ); BOOLIFY( analyse.b_fast_pskip ); BOOLIFY( analyse.b_dct_decimate ); BOOLIFY( analyse.b_psy ); BOOLIFY( analyse.b_psnr ); BOOLIFY( analyse.b_ssim ); BOOLIFY( rc.b_stat_write ); BOOLIFY( rc.b_stat_read ); BOOLIFY( rc.b_mb_tree ); BOOLIFY( rc.b_filler ); #undef BOOLIFY return 0; } static void mbcmp_init( x264_t *h ) { int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1; memcpy( 
h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c; h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8; h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4; h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL : satd ? h->pixf.intra_satd_x9_4x4 : h->pixf.intra_sad_x9_4x4; h->pixf.intra_mbcmp_x9_8x8 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL : satd ? h->pixf.intra_sa8d_x9_8x8 : h->pixf.intra_sad_x9_8x8; satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) ); } static void chroma_dsp_init( x264_t *h ) { memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) ); switch( CHROMA_FORMAT ) { case CHROMA_400: h->mc.prefetch_fenc = h->mc.prefetch_fenc_400; break; case CHROMA_420: memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) ); h->mc.prefetch_fenc = h->mc.prefetch_fenc_420; h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420; h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra; h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff; h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff; h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c; h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4; h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4; break; case CHROMA_422: memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) ); h->mc.prefetch_fenc = h->mc.prefetch_fenc_422; h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422; h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra; h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff; h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff; h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c; h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8; h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8; break; case CHROMA_444: h->mc.prefetch_fenc = h->mc.prefetch_fenc_422; /* FIXME: doesn't cover V plane */ h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff; h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff; break; } } static void set_aspect_ratio( x264_t *h, x264_param_t *param, int initial ) { /* VUI */ if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 ) { uint32_t i_w = param->vui.i_sar_width; uint32_t i_h = param->vui.i_sar_height; uint32_t old_w = h->param.vui.i_sar_width; uint32_t old_h = h->param.vui.i_sar_height; x264_reduce_fraction( &i_w, &i_h ); while( i_w > 65535 || i_h > 65535 ) { i_w /= 2; i_h /= 2; } x264_reduce_fraction( &i_w, &i_h ); if( i_w != old_w || i_h != old_h || initial ) { h->param.vui.i_sar_width = 0; 
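/* The stored SAR is reset here (width above, height just below) and is only
 * re-applied afterwards if the reduced fraction is valid, i.e. non-zero. */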
h->param.vui.i_sar_height = 0; if( i_w == 0 || i_h == 0 ) x264_log( h, X264_LOG_WARNING, "cannot create valid sample aspect ratio\n" ); else { x264_log( h, initial?X264_LOG_INFO:X264_LOG_DEBUG, "using SAR=%d/%d\n", i_w, i_h ); h->param.vui.i_sar_width = i_w; h->param.vui.i_sar_height = i_h; } } } } /**************************************************************************** * x264_encoder_open: ****************************************************************************/ x264_t *x264_encoder_open( x264_param_t *param, void *api ) { x264_t *h; char buf[1000], *p; int i_slicetype_length; CHECKED_MALLOCZERO( h, sizeof(x264_t) ); /* Create a copy of param */ memcpy( &h->param, param, sizeof(x264_param_t) ); h->param.opaque = NULL; h->param.param_free = NULL; if( h->param.psz_cqm_file ) CHECKED_PARAM_STRDUP( h->param.psz_cqm_file, &h->param, h->param.psz_cqm_file ); if( h->param.psz_dump_yuv ) CHECKED_PARAM_STRDUP( h->param.psz_dump_yuv, &h->param, h->param.psz_dump_yuv ); if( h->param.rc.psz_stat_out ) CHECKED_PARAM_STRDUP( h->param.rc.psz_stat_out, &h->param, h->param.rc.psz_stat_out ); if( h->param.rc.psz_stat_in ) CHECKED_PARAM_STRDUP( h->param.rc.psz_stat_in, &h->param, h->param.rc.psz_stat_in ); if( h->param.rc.psz_zones ) CHECKED_PARAM_STRDUP( h->param.rc.psz_zones, &h->param, h->param.rc.psz_zones ); if( h->param.psz_clbin_file ) CHECKED_PARAM_STRDUP( h->param.psz_clbin_file, &h->param, h->param.psz_clbin_file ); if( param->param_free ) { x264_param_cleanup( param ); param->param_free( param ); } /* Save pointer to bit depth independent interface */ h->api = api; #if HAVE_INTEL_DISPATCHER x264_intel_dispatcher_override(); #endif if( x264_threading_init() ) { x264_log( h, X264_LOG_ERROR, "unable to initialize threading\n" ); goto fail; } if( validate_parameters( h, 1 ) < 0 ) goto fail; if( h->param.psz_cqm_file ) if( x264_cqm_parse_file( h, h->param.psz_cqm_file ) < 0 ) goto fail; x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den ); x264_reduce_fraction( &h->param.i_timebase_num, &h->param.i_timebase_den ); /* Init x264_t */ h->i_frame = -1; h->i_frame_num = 0; if( h->param.i_avcintra_class ) h->i_idr_pic_id = h->param.i_avcintra_class > 200 ? 4 : 5; else h->i_idr_pic_id = 0; if( (uint64_t)h->param.i_timebase_den * 2 > UINT32_MAX ) { x264_log( h, X264_LOG_ERROR, "Effective timebase denominator %u exceeds H.264 maximum\n", h->param.i_timebase_den ); goto fail; } set_aspect_ratio( h, &h->param, 1 ); x264_sps_init( h->sps, h->param.i_sps_id, &h->param ); x264_sps_init_scaling_list( h->sps, &h->param ); x264_pps_init( h->pps, h->param.i_sps_id, &h->param, h->sps ); x264_validate_levels( h, 1 ); h->chroma_qp_table = i_chroma_qp_table + 12 + h->pps->i_chroma_qp_index_offset; if( x264_cqm_init( h ) < 0 ) goto fail; h->mb.i_mb_width = h->sps->i_mb_width; h->mb.i_mb_height = h->sps->i_mb_height; h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height; h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422; h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420; /* Adaptive MBAFF and subme 0 are not supported as we require halving motion * vectors during prediction, resulting in hpel mvs. * The chosen solution is to make MBAFF non-adaptive in this case. */ h->mb.b_adaptive_mbaff = PARAM_INTERLACED && h->param.analyse.i_subpel_refine; /* Init frames. 
*/ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && !h->param.rc.b_stat_read ) h->frames.i_delay = X264_MAX(h->param.i_bframe,3)*4; else h->frames.i_delay = h->param.i_bframe; if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ) h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead ); i_slicetype_length = h->frames.i_delay; h->frames.i_delay += h->i_thread_frames - 1; h->frames.i_delay += h->param.i_sync_lookahead; h->frames.i_delay += h->param.b_vfr_input; h->frames.i_bframe_delay = h->param.i_bframe ? (h->param.i_bframe_pyramid ? 2 : 1) : 0; h->frames.i_max_ref0 = h->param.i_frame_reference; h->frames.i_max_ref1 = X264_MIN( h->sps->vui.i_num_reorder_frames, h->param.i_frame_reference ); h->frames.i_max_dpb = h->sps->vui.i_max_dec_frame_buffering; h->frames.b_have_lowres = !h->param.rc.b_stat_read && ( h->param.rc.i_rc_method == X264_RC_ABR || h->param.rc.i_rc_method == X264_RC_CRF || h->param.i_bframe_adaptive || h->param.i_scenecut_threshold || h->param.rc.b_mb_tree || h->param.analyse.i_weighted_pred ); h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0; h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8); h->frames.i_last_idr = h->frames.i_last_keyframe = - h->param.i_keyint_max; h->frames.i_input = 0; h->frames.i_largest_pts = h->frames.i_second_largest_pts = -1; h->frames.i_poc_last_open_gop = -1; CHECKED_MALLOCZERO( h->cost_table, sizeof(*h->cost_table) ); CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) ); /* Allocate room for max refs plus a few extra just in case. */ CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + X264_REF_MAX + 4) * sizeof(x264_frame_t *) ); CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe + h->i_thread_frames + 3) * sizeof(x264_frame_t *) ); if( h->param.analyse.i_weighted_pred > 0 ) CHECKED_MALLOCZERO( h->frames.blank_unused, h->i_thread_frames * 4 * sizeof(x264_frame_t *) ); h->i_ref[0] = h->i_ref[1] = 0; h->i_cpb_delay = h->i_coded_fields = h->i_disp_fields = 0; h->i_prev_duration = ((uint64_t)h->param.i_fps_den * h->sps->vui.i_time_scale) / ((uint64_t)h->param.i_fps_num * h->sps->vui.i_num_units_in_tick); h->i_disp_fields_last_frame = -1; x264_rdo_init(); /* init CPU functions */ #if (ARCH_X86 || ARCH_X86_64) && HIGH_BIT_DEPTH /* FIXME: Only 8-bit has been optimized for AVX-512 so far. The few AVX-512 functions * enabled in high bit-depth are insignificant and just causes potential issues with * unnecessary thermal throttling and whatnot, so keep it disabled for now. */ h->param.cpu &= ~X264_CPU_AVX512; #endif x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); x264_pixel_init( h->param.cpu, &h->pixf ); x264_dct_init( h->param.cpu, &h->dctf ); x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced ); memcpy( &h->zigzagf, PARAM_INTERLACED ? 
&h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) ); x264_mc_init( h->param.cpu, &h->mc, h->param.b_cpu_independent ); x264_quant_init( h, h->param.cpu, &h->quantf ); x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED ); x264_bitstream_init( h->param.cpu, &h->bsf ); if( h->param.b_cabac ) x264_cabac_init( h ); else x264_cavlc_init( h ); mbcmp_init( h ); chroma_dsp_init( h ); p = buf + sprintf( buf, "using cpu capabilities:" ); for( int i = 0; x264_cpu_names[i].flags; i++ ) { if( !strcmp(x264_cpu_names[i].name, "SSE") && h->param.cpu & (X264_CPU_SSE2) ) continue; if( !strcmp(x264_cpu_names[i].name, "SSE2") && h->param.cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) ) continue; if( !strcmp(x264_cpu_names[i].name, "SSE3") && (h->param.cpu & X264_CPU_SSSE3 || !(h->param.cpu & X264_CPU_CACHELINE_64)) ) continue; if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; if( !strcmp(x264_cpu_names[i].name, "LZCNT") && (h->param.cpu & X264_CPU_BMI1) ) continue; if( !strcmp(x264_cpu_names[i].name, "BMI1") && (h->param.cpu & X264_CPU_BMI2) ) continue; if( !strcmp(x264_cpu_names[i].name, "FMA4") && (h->param.cpu & X264_CPU_FMA3) ) continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); } if( !h->param.cpu ) p += sprintf( p, " none!" ); x264_log( h, X264_LOG_INFO, "%s\n", buf ); if( x264_analyse_init_costs( h ) ) goto fail; /* Must be volatile or else GCC will optimize it out. */ volatile int temp = 392; if( x264_clz( temp ) != 23 ) { x264_log( h, X264_LOG_ERROR, "CLZ test failed: x264 has been miscompiled!\n" ); #if ARCH_X86 || ARCH_X86_64 x264_log( h, X264_LOG_ERROR, "Are you attempting to run an SSE4a/LZCNT-targeted build on a CPU that\n" ); x264_log( h, X264_LOG_ERROR, "doesn't support it?\n" ); #endif goto fail; } h->out.i_nal = 0; h->out.i_bitstream = x264_clip3f( h->param.i_width * h->param.i_height * 4 * ( h->param.rc.i_rc_method == X264_RC_ABR ? 
pow( 0.95, h->param.rc.i_qp_min ) : pow( 0.95, h->param.rc.i_qp_constant ) * X264_MAX( 1, h->param.rc.f_ip_factor ) ), 1000000, INT_MAX/3 ); h->nal_buffer_size = h->out.i_bitstream * 3/2 + 4 + 64; /* +4 for startcode, +64 for nal_escape assembly padding */ CHECKED_MALLOC( h->nal_buffer, h->nal_buffer_size ); CHECKED_MALLOC( h->reconfig_h, sizeof(x264_t) ); if( h->param.i_threads > 1 && x264_threadpool_init( &h->threadpool, h->param.i_threads ) ) goto fail; if( h->param.i_lookahead_threads > 1 && x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads ) ) goto fail; #if HAVE_OPENCL if( h->param.b_opencl ) { h->opencl.ocl = x264_opencl_load_library(); if( !h->opencl.ocl ) { x264_log( h, X264_LOG_WARNING, "failed to load OpenCL\n" ); h->param.b_opencl = 0; } } #endif h->thread[0] = h; for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ ) CHECKED_MALLOC( h->thread[i], sizeof(x264_t) ); if( h->param.i_lookahead_threads > 1 ) for( int i = 0; i < h->param.i_lookahead_threads; i++ ) { CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) ); *h->lookahead_thread[i] = *h; } *h->reconfig_h = *h; for( int i = 0; i < h->param.i_threads; i++ ) { int init_nal_count = h->param.i_slice_count + 3; int allocate_threadlocal_data = !h->param.b_sliced_threads || !i; if( i > 0 ) *h->thread[i] = *h; if( x264_pthread_mutex_init( &h->thread[i]->mutex, NULL ) ) goto fail; if( x264_pthread_cond_init( &h->thread[i]->cv, NULL ) ) goto fail; if( allocate_threadlocal_data ) { h->thread[i]->fdec = x264_frame_pop_unused( h, 1 ); if( !h->thread[i]->fdec ) goto fail; } else h->thread[i]->fdec = h->thread[0]->fdec; CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream ); /* Start each thread with room for init_nal_count NAL units; it'll realloc later if needed. */ CHECKED_MALLOC( h->thread[i]->out.nal, init_nal_count*sizeof(x264_nal_t) ); h->thread[i]->out.i_nals_allocated = init_nal_count; if( allocate_threadlocal_data && x264_macroblock_cache_allocate( h->thread[i] ) < 0 ) goto fail; } #if HAVE_OPENCL if( h->param.b_opencl && x264_opencl_lookahead_init( h ) < 0 ) h->param.b_opencl = 0; #endif if( x264_lookahead_init( h, i_slicetype_length ) ) goto fail; for( int i = 0; i < h->param.i_threads; i++ ) if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 ) goto fail; if( x264_ratecontrol_new( h ) < 0 ) goto fail; if( h->param.i_nal_hrd ) { x264_log( h, X264_LOG_DEBUG, "HRD bitrate: %i bits/sec\n", h->sps->vui.hrd.i_bit_rate_unscaled ); x264_log( h, X264_LOG_DEBUG, "CPB size: %i bits\n", h->sps->vui.hrd.i_cpb_size_unscaled ); } if( h->param.psz_dump_yuv ) { /* create or truncate the reconstructed video file */ FILE *f = x264_fopen( h->param.psz_dump_yuv, "w" ); if( !f ) { x264_log( h, X264_LOG_ERROR, "dump_yuv: can't write to %s\n", h->param.psz_dump_yuv ); goto fail; } else if( !x264_is_regular_file( f ) ) { x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv ); fclose( f ); goto fail; } fclose( f ); } const char *profile = h->sps->i_profile_idc == PROFILE_BASELINE ? "Constrained Baseline" : h->sps->i_profile_idc == PROFILE_MAIN ? "Main" : h->sps->i_profile_idc == PROFILE_HIGH ? "High" : h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 ? "High 10 Intra" : "High 10") : h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 ? "High 4:2:2 Intra" : "High 4:2:2") : h->sps->b_constraint_set3 ? 
"High 4:4:4 Intra" : "High 4:4:4 Predictive"; char level[16]; if( h->sps->i_level_idc == 9 || ( h->sps->i_level_idc == 11 && h->sps->b_constraint_set3 && (h->sps->i_profile_idc == PROFILE_BASELINE || h->sps->i_profile_idc == PROFILE_MAIN) ) ) strcpy( level, "1b" ); else snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc / 10, h->sps->i_level_idc % 10 ); static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" }; x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s, %d-bit\n", profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH ); return h; fail: x264_free( h ); return NULL; } /****************************************************************************/ static int encoder_try_reconfig( x264_t *h, x264_param_t *param, int *rc_reconfig ) { *rc_reconfig = 0; set_aspect_ratio( h, param, 0 ); #define COPY(var) h->param.var = param->var COPY( i_frame_reference ); // but never uses more refs than initially specified COPY( i_bframe_bias ); if( h->param.i_scenecut_threshold ) COPY( i_scenecut_threshold ); // can't turn it on or off, only vary the threshold COPY( b_deblocking_filter ); COPY( i_deblocking_filter_alphac0 ); COPY( i_deblocking_filter_beta ); COPY( i_frame_packing ); COPY( mastering_display ); COPY( content_light_level ); COPY( i_alternative_transfer ); COPY( analyse.inter ); COPY( analyse.intra ); COPY( analyse.i_direct_mv_pred ); /* Scratch buffer prevents me_range from being increased for esa/tesa */ if( h->param.analyse.i_me_method < X264_ME_ESA || param->analyse.i_me_range < h->param.analyse.i_me_range ) COPY( analyse.i_me_range ); COPY( analyse.i_noise_reduction ); /* We can't switch out of subme=0 during encoding. */ if( h->param.analyse.i_subpel_refine ) COPY( analyse.i_subpel_refine ); COPY( analyse.i_trellis ); COPY( analyse.b_chroma_me ); COPY( analyse.b_dct_decimate ); COPY( analyse.b_fast_pskip ); COPY( analyse.b_mixed_references ); COPY( analyse.f_psy_rd ); COPY( analyse.f_psy_trellis ); COPY( crop_rect ); // can only twiddle these if they were enabled to begin with: if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA ) COPY( analyse.i_me_method ); if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa ) h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8; if( h->pps->b_transform_8x8_mode ) COPY( analyse.b_transform_8x8 ); if( h->frames.i_max_ref1 > 1 ) COPY( i_bframe_pyramid ); COPY( i_slice_max_size ); COPY( i_slice_max_mbs ); COPY( i_slice_min_mbs ); COPY( i_slice_count ); COPY( i_slice_count_max ); COPY( b_tff ); /* VBV can't be turned on if it wasn't on to begin with */ if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 && param->rc.i_vbv_max_bitrate > 0 && param->rc.i_vbv_buffer_size > 0 ) { *rc_reconfig |= h->param.rc.i_vbv_max_bitrate != param->rc.i_vbv_max_bitrate; *rc_reconfig |= h->param.rc.i_vbv_buffer_size != param->rc.i_vbv_buffer_size; *rc_reconfig |= h->param.rc.i_bitrate != param->rc.i_bitrate; COPY( rc.i_vbv_max_bitrate ); COPY( rc.i_vbv_buffer_size ); COPY( rc.i_bitrate ); } *rc_reconfig |= h->param.rc.f_rf_constant != param->rc.f_rf_constant; *rc_reconfig |= h->param.rc.f_rf_constant_max != param->rc.f_rf_constant_max; COPY( rc.f_rf_constant ); COPY( rc.f_rf_constant_max ); #undef COPY return validate_parameters( h, 0 ); } int x264_encoder_reconfig_apply( x264_t *h, x264_param_t *param ) { int rc_reconfig; int ret = encoder_try_reconfig( h, param, &rc_reconfig ); mbcmp_init( h ); if( !ret ) x264_sps_init_reconfigurable( h->sps, 
&h->param ); /* Supported reconfiguration options (1-pass only): * vbv-maxrate * vbv-bufsize * crf * bitrate (CBR only) */ if( !ret && rc_reconfig ) x264_ratecontrol_init_reconfigurable( h, 0 ); return ret; } /**************************************************************************** * x264_encoder_reconfig: ****************************************************************************/ int x264_encoder_reconfig( x264_t *h, x264_param_t *param ) { h = h->thread[h->thread[0]->i_thread_phase]; x264_param_t param_save = h->reconfig_h->param; h->reconfig_h->param = h->param; int rc_reconfig; int ret = encoder_try_reconfig( h->reconfig_h, param, &rc_reconfig ); if( !ret ) h->reconfig = 1; else h->reconfig_h->param = param_save; return ret; } /**************************************************************************** * x264_encoder_parameters: ****************************************************************************/ void x264_encoder_parameters( x264_t *h, x264_param_t *param ) { memcpy( param, &h->thread[h->i_thread_phase]->param, sizeof(x264_param_t) ); param->opaque = NULL; } /* internal usage */ static void nal_start( x264_t *h, int i_type, int i_ref_idc ) { x264_nal_t *nal = &h->out.nal[h->out.i_nal]; nal->i_ref_idc = i_ref_idc; nal->i_type = i_type; nal->b_long_startcode = 1; nal->i_payload= 0; nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8]; nal->i_padding= 0; } /* if number of allocated nals is not enough, re-allocate a larger one. */ static int nal_check_buffer( x264_t *h ) { if( h->out.i_nal >= h->out.i_nals_allocated ) { x264_nal_t *new_out = x264_malloc( sizeof(x264_nal_t) * (h->out.i_nals_allocated*2) ); if( !new_out ) return -1; memcpy( new_out, h->out.nal, sizeof(x264_nal_t) * (h->out.i_nals_allocated) ); x264_free( h->out.nal ); h->out.nal = new_out; h->out.i_nals_allocated *= 2; } return 0; } static int nal_end( x264_t *h ) { x264_nal_t *nal = &h->out.nal[h->out.i_nal]; uint8_t *end = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8]; nal->i_payload = end - nal->p_payload; /* Assembly implementation of nal_escape reads past the end of the input. * While undefined padding wouldn't actually affect the output, it makes valgrind unhappy. */ memset( end, 0xff, 64 ); if( h->param.nalu_process ) h->param.nalu_process( (x264_t *)h->api, nal, h->fenc->opaque ); h->out.i_nal++; return nal_check_buffer( h ); } static int check_encapsulated_buffer( x264_t *h, x264_t *h0, int start, int64_t previous_nal_size, int64_t necessary_size ) { if( h0->nal_buffer_size < necessary_size ) { necessary_size *= 2; if( necessary_size > INT_MAX ) return -1; uint8_t *buf = x264_malloc( necessary_size ); if( !buf ) return -1; if( previous_nal_size ) memcpy( buf, h0->nal_buffer, previous_nal_size ); intptr_t delta = buf - h0->nal_buffer; for( int i = 0; i < start; i++ ) h->out.nal[i].p_payload += delta; x264_free( h0->nal_buffer ); h0->nal_buffer = buf; h0->nal_buffer_size = necessary_size; } return 0; } static int encoder_encapsulate_nals( x264_t *h, int start ) { x264_t *h0 = h->thread[0]; int64_t nal_size = 0, previous_nal_size = 0; if( h->param.nalu_process ) { for( int i = start; i < h->out.i_nal; i++ ) nal_size += h->out.nal[i].i_payload; if( nal_size > INT_MAX ) return -1; return nal_size; } for( int i = 0; i < start; i++ ) previous_nal_size += h->out.nal[i].i_payload; for( int i = start; i < h->out.i_nal; i++ ) nal_size += h->out.nal[i].i_payload; /* Worst-case NAL unit escaping: reallocate the buffer if it's too small. 
*/ int64_t necessary_size = previous_nal_size + nal_size * 3/2 + h->out.i_nal * 4 + 4 + 64; for( int i = start; i < h->out.i_nal; i++ ) necessary_size += h->out.nal[i].i_padding; if( check_encapsulated_buffer( h, h0, start, previous_nal_size, necessary_size ) ) return -1; uint8_t *nal_buffer = h0->nal_buffer + previous_nal_size; for( int i = start; i < h->out.i_nal; i++ ) { h->out.nal[i].b_long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS || h->param.i_avcintra_class; x264_nal_encode( h, nal_buffer, &h->out.nal[i] ); nal_buffer += h->out.nal[i].i_payload; } x264_emms(); return nal_buffer - (h0->nal_buffer + previous_nal_size); } /**************************************************************************** * x264_encoder_headers: ****************************************************************************/ int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal ) { int frame_size = 0; /* init bitstream context */ h->out.i_nal = 0; bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream ); /* Write SEI, SPS and PPS. */ /* generate sequence parameters */ nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST ); x264_sps_write( &h->out.bs, h->sps ); if( nal_end( h ) ) return -1; /* generate picture parameters */ nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST ); x264_pps_write( &h->out.bs, h->sps, h->pps ); if( nal_end( h ) ) return -1; /* identify ourselves */ nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); if( x264_sei_version_write( h, &h->out.bs ) ) return -1; if( nal_end( h ) ) return -1; frame_size = encoder_encapsulate_nals( h, 0 ); if( frame_size < 0 ) return -1; /* now set output*/ *pi_nal = h->out.i_nal; *pp_nal = &h->out.nal[0]; h->out.i_nal = 0; return frame_size; } /* Check to see whether we have chosen a reference list ordering different * from the standard's default. */ static inline void reference_check_reorder( x264_t *h ) { /* The reorder check doesn't check for missing frames, so just * force a reorder if one of the reference list is corrupt. */ for( int i = 0; h->frames.reference[i]; i++ ) if( h->frames.reference[i]->b_corrupt ) { h->b_ref_reorder[0] = 1; return; } for( int list = 0; list <= (h->sh.i_type == SLICE_TYPE_B); list++ ) for( int i = 0; i < h->i_ref[list] - 1; i++ ) { int framenum_diff = h->fref[list][i+1]->i_frame_num - h->fref[list][i]->i_frame_num; int poc_diff = h->fref[list][i+1]->i_poc - h->fref[list][i]->i_poc; /* P and B-frames use different default orders. */ if( h->sh.i_type == SLICE_TYPE_P ? framenum_diff > 0 : list == 1 ? poc_diff < 0 : poc_diff > 0 ) { h->b_ref_reorder[list] = 1; return; } } } /* return -1 on failure, else return the index of the new reference frame */ static int weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w ) { int i = h->i_ref[0]; int j = 1; x264_frame_t *newframe; if( i <= 1 ) /* empty list, definitely can't duplicate frame */ return -1; //Duplication is only used in X264_WEIGHTP_SMART if( h->param.analyse.i_weighted_pred != X264_WEIGHTP_SMART ) return -1; /* Duplication is a hack to compensate for crappy rounding in motion compensation. * With high bit depth, it's not worth doing, so turn it off except in the case of * unweighted dupes. 
*/ if( BIT_DEPTH > 8 && w != x264_weight_none ) return -1; newframe = x264_frame_pop_blank_unused( h ); if( !newframe ) return -1; //FIXME: probably don't need to copy everything *newframe = *h->fref[0][i_ref]; newframe->i_reference_count = 1; newframe->orig = h->fref[0][i_ref]; newframe->b_duplicate = 1; memcpy( h->fenc->weight[j], w, sizeof(h->fenc->weight[i]) ); /* shift the frames to make space for the dupe. */ h->b_ref_reorder[0] = 1; if( h->i_ref[0] < X264_REF_MAX ) ++h->i_ref[0]; h->fref[0][X264_REF_MAX-1] = NULL; x264_frame_unshift( &h->fref[0][j], newframe ); return j; } static void weighted_pred_init( x264_t *h ) { /* for now no analysis and set all weights to nothing */ for( int i_ref = 0; i_ref < h->i_ref[0]; i_ref++ ) h->fenc->weighted[i_ref] = h->fref[0][i_ref]->filtered[0][0]; // FIXME: This only supports weighting of one reference frame // and duplicates of that frame. h->fenc->i_lines_weighted = 0; for( int i_ref = 0; i_ref < (h->i_ref[0] << SLICE_MBAFF); i_ref++ ) for( int i = 0; i < 3; i++ ) h->sh.weight[i_ref][i].weightfn = NULL; if( h->sh.i_type != SLICE_TYPE_P || h->param.analyse.i_weighted_pred <= 0 ) return; int i_padv = PADV << PARAM_INTERLACED; int denom = -1; int weightplane[2] = { 0, 0 }; int buffer_next = 0; for( int i = 0; i < 3; i++ ) { for( int j = 0; j < h->i_ref[0]; j++ ) { if( h->fenc->weight[j][i].weightfn ) { h->sh.weight[j][i] = h->fenc->weight[j][i]; // if weight is useless, don't write it to stream if( h->sh.weight[j][i].i_scale == 1<<h->sh.weight[j][i].i_denom && h->sh.weight[j][i].i_offset == 0 ) h->sh.weight[j][i].weightfn = NULL; else { if( !weightplane[!!i] ) { weightplane[!!i] = 1; h->sh.weight[0][!!i].i_denom = denom = h->sh.weight[j][i].i_denom; assert( x264_clip3( denom, 0, 7 ) == denom ); } assert( h->sh.weight[j][i].i_denom == denom ); if( !i ) { h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH_ALIGN; //scale full resolution frame if( h->param.i_threads == 1 ) { pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH_ALIGN; pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN; int stride = h->fenc->i_stride[0]; int width = h->fenc->i_width[0] + PADH2; int height = h->fenc->i_lines[0] + i_padv*2; x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] ); h->fenc->i_lines_weighted = height; } } } } } } if( weightplane[1] ) for( int i = 0; i < h->i_ref[0]; i++ ) { if( h->sh.weight[i][1].weightfn && !h->sh.weight[i][2].weightfn ) { h->sh.weight[i][2].i_scale = 1 << h->sh.weight[0][1].i_denom; h->sh.weight[i][2].i_offset = 0; } else if( h->sh.weight[i][2].weightfn && !h->sh.weight[i][1].weightfn ) { h->sh.weight[i][1].i_scale = 1 << h->sh.weight[0][1].i_denom; h->sh.weight[i][1].i_offset = 0; } } if( !weightplane[0] ) h->sh.weight[0][0].i_denom = 0; if( !weightplane[1] ) h->sh.weight[0][1].i_denom = 0; h->sh.weight[0][2].i_denom = h->sh.weight[0][1].i_denom; } static inline int reference_distance( x264_t *h, x264_frame_t *frame ) { if( h->param.i_frame_packing == 5 ) return abs((h->fenc->i_frame&~1) - (frame->i_frame&~1)) + ((h->fenc->i_frame&1) != (frame->i_frame&1)); else return abs(h->fenc->i_frame - frame->i_frame); } static inline void reference_build_list( x264_t *h, int i_poc ) { int b_ok; /* build ref list 0/1 */ h->mb.pic.i_fref[0] = h->i_ref[0] = 0; h->mb.pic.i_fref[1] = h->i_ref[1] = 0; if( h->sh.i_type == SLICE_TYPE_I ) return; for( int i = 0; h->frames.reference[i]; i++ ) { if(
h->frames.reference[i]->b_corrupt ) continue; if( h->frames.reference[i]->i_poc < i_poc ) h->fref[0][h->i_ref[0]++] = h->frames.reference[i]; else if( h->frames.reference[i]->i_poc > i_poc ) h->fref[1][h->i_ref[1]++] = h->frames.reference[i]; } if( h->sh.i_mmco_remove_from_end ) { /* Order ref0 for MMCO remove */ do { b_ok = 1; for( int i = 0; i < h->i_ref[0] - 1; i++ ) { if( h->fref[0][i]->i_frame < h->fref[0][i+1]->i_frame ) { XCHG( x264_frame_t*, h->fref[0][i], h->fref[0][i+1] ); b_ok = 0; break; } } } while( !b_ok ); for( int i = h->i_ref[0]-1; i >= h->i_ref[0] - h->sh.i_mmco_remove_from_end; i-- ) { int diff = h->i_frame_num - h->fref[0][i]->i_frame_num; h->sh.mmco[h->sh.i_mmco_command_count].i_poc = h->fref[0][i]->i_poc; h->sh.mmco[h->sh.i_mmco_command_count++].i_difference_of_pic_nums = diff; } } /* Order reference lists by distance from the current frame. */ for( int list = 0; list < 2; list++ ) { h->fref_nearest[list] = h->fref[list][0]; do { b_ok = 1; for( int i = 0; i < h->i_ref[list] - 1; i++ ) { if( list ? h->fref[list][i+1]->i_poc < h->fref_nearest[list]->i_poc : h->fref[list][i+1]->i_poc > h->fref_nearest[list]->i_poc ) h->fref_nearest[list] = h->fref[list][i+1]; if( reference_distance( h, h->fref[list][i] ) > reference_distance( h, h->fref[list][i+1] ) ) { XCHG( x264_frame_t*, h->fref[list][i], h->fref[list][i+1] ); b_ok = 0; break; } } } while( !b_ok ); } reference_check_reorder( h ); h->i_ref[1] = X264_MIN( h->i_ref[1], h->frames.i_max_ref1 ); h->i_ref[0] = X264_MIN( h->i_ref[0], h->frames.i_max_ref0 ); h->i_ref[0] = X264_MIN( h->i_ref[0], h->param.i_frame_reference ); // if reconfig() has lowered the limit /* For Blu-ray compliance, don't reference frames outside of the minigop. */ if( IS_X264_TYPE_B( h->fenc->i_type ) && h->param.b_bluray_compat ) h->i_ref[0] = X264_MIN( h->i_ref[0], IS_X264_TYPE_B( h->fref[0][0]->i_type ) + 1 ); /* add duplicates */ if( h->fenc->i_type == X264_TYPE_P ) { int idx = -1; if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE ) { x264_weight_t w[3]; w[1].weightfn = w[2].weightfn = NULL; if( h->param.rc.b_stat_read ) x264_ratecontrol_set_weights( h, h->fenc ); if( !h->fenc->weight[0][0].weightfn ) { h->fenc->weight[0][0].i_denom = 0; SET_WEIGHT( w[0], 1, 1, 0, -1 ); idx = weighted_reference_duplicate( h, 0, w ); } else { if( h->fenc->weight[0][0].i_scale == 1<<h->fenc->weight[0][0].i_denom ) { SET_WEIGHT( h->fenc->weight[0][0], 1, 1, 0, h->fenc->weight[0][0].i_offset ); } weighted_reference_duplicate( h, 0, x264_weight_none ); if( h->fenc->weight[0][0].i_offset > -128 ) { w[0] = h->fenc->weight[0][0]; w[0].i_offset--; h->mc.weight_cache( h, &w[0] ); idx = weighted_reference_duplicate( h, 0, w ); } } } h->mb.ref_blind_dupe = idx; } assert( h->i_ref[0] + h->i_ref[1] <= X264_REF_MAX ); h->mb.pic.i_fref[0] = h->i_ref[0]; h->mb.pic.i_fref[1] = h->i_ref[1]; } static void fdec_filter_row( x264_t *h, int mb_y, int pass ) { /* mb_y is the mb to be encoded next, not the mb to be filtered here */ int b_hpel = h->fdec->b_kept_as_ref; int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1; int b_end = mb_y == h->i_threadslice_end; int b_measure_quality = 1; int min_y = mb_y - (1 << SLICE_MBAFF); int b_start = min_y == h->i_threadslice_start; /* Even in interlaced mode, deblocking never modifies more than 4 pixels * above each MB, as bS=4 doesn't happen for the top of interlaced mbpairs.
*/ int minpix_y = min_y*16 - 4 * !b_start; int maxpix_y = mb_y*16 - 4 * !b_end; b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv; if( h->param.b_sliced_threads ) { switch( pass ) { /* During encode: only do deblock if asked for */ default: case 0: b_deblock &= h->param.b_full_recon; b_hpel = 0; break; /* During post-encode pass: do deblock if not done yet, do hpel for all * rows except those between slices. */ case 1: b_deblock &= !h->param.b_full_recon; b_hpel &= !(b_start && min_y > 0); b_measure_quality = 0; break; /* Final pass: do the rows between slices in sequence. */ case 2: b_deblock = 0; b_measure_quality = 0; break; } } if( mb_y & SLICE_MBAFF ) return; if( min_y < h->i_threadslice_start ) return; if( b_deblock ) for( int y = min_y; y < mb_y; y += (1 << SLICE_MBAFF) ) x264_frame_deblock_row( h, y ); /* FIXME: Prediction requires different borders for interlaced/progressive mc, * but the actual image data is equivalent. For now, maintain this * consistency by copying deblocked pixels between planes. */ if( PARAM_INTERLACED && (!h->param.b_sliced_threads || pass == 1) ) for( int p = 0; p < h->fdec->i_plane; p++ ) for( int i = minpix_y>>(CHROMA_V_SHIFT && p); i < maxpix_y>>(CHROMA_V_SHIFT && p); i++ ) memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p], h->fdec->plane[p] + i*h->fdec->i_stride[p], h->mb.i_mb_width*16*SIZEOF_PIXEL ); if( h->fdec->b_kept_as_ref && (!h->param.b_sliced_threads || pass == 1) ) x264_frame_expand_border( h, h->fdec, min_y ); if( b_hpel ) { int end = mb_y == h->mb.i_mb_height; /* Can't do hpel until the previous slice is done encoding. */ if( h->param.analyse.i_subpel_refine ) { x264_frame_filter( h, h->fdec, min_y, end ); x264_frame_expand_border_filtered( h, h->fdec, min_y, end ); } } if( SLICE_MBAFF && pass == 0 ) for( int i = 0; i < 3; i++ ) { XCHG( pixel *, h->intra_border_backup[0][i], h->intra_border_backup[3][i] ); XCHG( pixel *, h->intra_border_backup[1][i], h->intra_border_backup[4][i] ); } if( h->i_thread_frames > 1 && h->fdec->b_kept_as_ref ) x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << SLICE_MBAFF)) ); if( b_measure_quality ) { maxpix_y = X264_MIN( maxpix_y, h->param.i_height ); if( h->param.analyse.b_psnr ) { for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ ) h->stat.frame.i_ssd[p] += x264_pixel_ssd_wxh( &h->pixf, h->fdec->plane[p] + minpix_y * h->fdec->i_stride[p], h->fdec->i_stride[p], h->fenc->plane[p] + minpix_y * h->fenc->i_stride[p], h->fenc->i_stride[p], h->param.i_width, maxpix_y-minpix_y ); if( !CHROMA444 ) { uint64_t ssd_u, ssd_v; int v_shift = CHROMA_V_SHIFT; x264_pixel_ssd_nv12( &h->pixf, h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1], h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1], h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v ); h->stat.frame.i_ssd[1] += ssd_u; h->stat.frame.i_ssd[2] += ssd_v; } } if( h->param.analyse.b_ssim ) { int ssim_cnt; x264_emms(); /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks, * and overlap by 4 */ minpix_y += b_start ? 
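            /* Note: quality stats are accumulated per filtered row -- the per-plane SSD
             * feeds the end-of-frame PSNR, and for non-4:4:4 content the chroma SSD is
             * presumably computed on the interleaved chroma plane (hence ssd_nv12) and
             * split into U and V.  The +2/-6 adjustment below follows the comment above:
             * shift the SSIM window off the DCT-block grid and let row strips overlap. */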
2 : -6; h->stat.frame.f_ssim += x264_pixel_ssim_wxh( &h->pixf, h->fdec->plane[0] + 2+minpix_y*h->fdec->i_stride[0], h->fdec->i_stride[0], h->fenc->plane[0] + 2+minpix_y*h->fenc->i_stride[0], h->fenc->i_stride[0], h->param.i_width-2, maxpix_y-minpix_y, h->scratch_buffer, &ssim_cnt ); h->stat.frame.i_ssim_cnt += ssim_cnt; } } } static inline int reference_update( x264_t *h ) { if( !h->fdec->b_kept_as_ref ) { if( h->i_thread_frames > 1 ) { x264_frame_push_unused( h, h->fdec ); h->fdec = x264_frame_pop_unused( h, 1 ); if( !h->fdec ) return -1; } return 0; } /* apply mmco from previous frame. */ for( int i = 0; i < h->sh.i_mmco_command_count; i++ ) for( int j = 0; h->frames.reference[j]; j++ ) if( h->frames.reference[j]->i_poc == h->sh.mmco[i].i_poc ) x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[j] ) ); /* move frame in the buffer */ x264_frame_push( h->frames.reference, h->fdec ); if( h->frames.reference[h->sps->i_num_ref_frames] ) x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) ); h->fdec = x264_frame_pop_unused( h, 1 ); if( !h->fdec ) return -1; return 0; } static inline void reference_reset( x264_t *h ) { while( h->frames.reference[0] ) x264_frame_push_unused( h, x264_frame_pop( h->frames.reference ) ); h->fdec->i_poc = h->fenc->i_poc = 0; } static inline void reference_hierarchy_reset( x264_t *h ) { int ref; int b_hasdelayframe = 0; /* look for delay frames -- chain must only contain frames that are disposable */ for( int i = 0; h->frames.current[i] && IS_DISPOSABLE( h->frames.current[i]->i_type ); i++ ) b_hasdelayframe |= h->frames.current[i]->i_coded != h->frames.current[i]->i_frame + h->sps->vui.i_num_reorder_frames; /* This function must handle b-pyramid and clear frames for open-gop */ if( h->param.i_bframe_pyramid != X264_B_PYRAMID_STRICT && !b_hasdelayframe && h->frames.i_poc_last_open_gop == -1 ) return; /* Remove last BREF. There will never be old BREFs in the * dpb during a BREF decode when pyramid == STRICT */ for( ref = 0; h->frames.reference[ref]; ref++ ) { if( ( h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT && h->frames.reference[ref]->i_type == X264_TYPE_BREF ) || ( h->frames.reference[ref]->i_poc < h->frames.i_poc_last_open_gop && h->sh.i_type != SLICE_TYPE_B ) ) { int diff = h->i_frame_num - h->frames.reference[ref]->i_frame_num; h->sh.mmco[h->sh.i_mmco_command_count].i_difference_of_pic_nums = diff; h->sh.mmco[h->sh.i_mmco_command_count++].i_poc = h->frames.reference[ref]->i_poc; x264_frame_push_unused( h, x264_frame_shift( &h->frames.reference[ref] ) ); h->b_ref_reorder[0] = 1; ref--; } } /* Prepare room in the dpb for the delayed display time of the later b-frame's */ if( h->param.i_bframe_pyramid ) h->sh.i_mmco_remove_from_end = X264_MAX( ref + 2 - h->frames.i_max_dpb, 0 ); } static inline void slice_init( x264_t *h, int i_nal_type, int i_global_qp ) { /* ------------------------ Create slice header ----------------------- */ if( i_nal_type == NAL_SLICE_IDR ) { slice_header_init( h, &h->sh, h->sps, h->pps, h->i_idr_pic_id, h->i_frame_num, i_global_qp ); /* alternate id */ if( h->param.i_avcintra_class ) { switch( h->i_idr_pic_id ) { case 5: h->i_idr_pic_id = 3; break; case 3: h->i_idr_pic_id = 4; break; case 4: default: h->i_idr_pic_id = 5; break; } } else h->i_idr_pic_id ^= 1; } else { slice_header_init( h, &h->sh, h->sps, h->pps, -1, h->i_frame_num, i_global_qp ); h->sh.i_num_ref_idx_l0_active = h->i_ref[0] <= 0 ? 1 : h->i_ref[0]; h->sh.i_num_ref_idx_l1_active = h->i_ref[1] <= 0 ? 
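        /* Note: each list always advertises at least one active reference (hence the
         * "<= 0 ? 1" clamps here); b_num_ref_idx_override is then set below whenever the
         * active counts differ from the PPS defaults.  For AVC-Intra the idr_pic_id
         * cycles 5 -> 3 -> 4 -> 5 instead of simply toggling between two values. */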
1 : h->i_ref[1]; if( h->sh.i_num_ref_idx_l0_active != h->pps->i_num_ref_idx_l0_default_active || (h->sh.i_type == SLICE_TYPE_B && h->sh.i_num_ref_idx_l1_active != h->pps->i_num_ref_idx_l1_default_active) ) { h->sh.b_num_ref_idx_override = 1; } } if( h->fenc->i_type == X264_TYPE_BREF && h->param.b_bluray_compat && h->sh.i_mmco_command_count ) { h->b_sh_backup = 1; h->sh_backup = h->sh; } h->fdec->i_frame_num = h->sh.i_frame_num; if( h->sps->i_poc_type == 0 ) { h->sh.i_poc = h->fdec->i_poc; if( PARAM_INTERLACED ) { h->sh.i_delta_poc_bottom = h->param.b_tff ? 1 : -1; h->sh.i_poc += h->sh.i_delta_poc_bottom == -1; } else h->sh.i_delta_poc_bottom = 0; h->fdec->i_delta_poc[0] = h->sh.i_delta_poc_bottom == -1; h->fdec->i_delta_poc[1] = h->sh.i_delta_poc_bottom == 1; } else { /* Nothing to do ? */ } x264_macroblock_slice_init( h ); } typedef struct { int skip; uint8_t cabac_prevbyte; bs_t bs; x264_cabac_t cabac; x264_frame_stat_t stat; int last_qp; int last_dqp; int field_decoding_flag; } x264_bs_bak_t; static ALWAYS_INLINE void bitstream_backup( x264_t *h, x264_bs_bak_t *bak, int i_skip, int full ) { if( full ) { bak->stat = h->stat.frame; bak->last_qp = h->mb.i_last_qp; bak->last_dqp = h->mb.i_last_dqp; bak->field_decoding_flag = h->mb.field_decoding_flag; } else { bak->stat.i_mv_bits = h->stat.frame.i_mv_bits; bak->stat.i_tex_bits = h->stat.frame.i_tex_bits; } /* In the per-MB backup, we don't need the contexts because flushing the CABAC * encoder has no context dependency and in this case, a slice is ended (and * thus the content of all contexts are thrown away). */ if( h->param.b_cabac ) { if( full ) memcpy( &bak->cabac, &h->cabac, sizeof(x264_cabac_t) ); else memcpy( &bak->cabac, &h->cabac, offsetof(x264_cabac_t, f8_bits_encoded) ); /* x264's CABAC writer modifies the previous byte during carry, so it has to be * backed up. */ bak->cabac_prevbyte = h->cabac.p[-1]; } else { bak->bs = h->out.bs; bak->skip = i_skip; } } static ALWAYS_INLINE void bitstream_restore( x264_t *h, x264_bs_bak_t *bak, int *skip, int full ) { if( full ) { h->stat.frame = bak->stat; h->mb.i_last_qp = bak->last_qp; h->mb.i_last_dqp = bak->last_dqp; h->mb.field_decoding_flag = bak->field_decoding_flag; } else { h->stat.frame.i_mv_bits = bak->stat.i_mv_bits; h->stat.frame.i_tex_bits = bak->stat.i_tex_bits; } if( h->param.b_cabac ) { if( full ) memcpy( &h->cabac, &bak->cabac, sizeof(x264_cabac_t) ); else memcpy( &h->cabac, &bak->cabac, offsetof(x264_cabac_t, f8_bits_encoded) ); h->cabac.p[-1] = bak->cabac_prevbyte; } else { h->out.bs = bak->bs; *skip = bak->skip; } } static intptr_t slice_write( x264_t *h ) { int i_skip; int mb_xy, i_mb_x, i_mb_y; /* NALUs other than the first use a 3-byte startcode. * Add one extra byte for the rbsp, and one more for the final CABAC putbyte. * Then add an extra 5 bytes just in case, to account for random NAL escapes and * other inaccuracies. */ int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 1 + h->param.b_cabac + 5; int slice_max_size = h->param.i_slice_max_size > 0 ? 
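        /* Note: the slice size budget is tracked in *bits*; the per-NAL overhead guess
         * above (startcode/rbsp/CABAC-flush bytes plus a small safety margin) is
         * subtracted from the byte cap first.  Purely illustrative numbers: a 1500-byte
         * cap with a 9-byte overhead guess leaves (1500 - 9) * 8 = 11928 bits of
         * macroblock data. */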
(h->param.i_slice_max_size-overhead_guess)*8 : 0; int back_up_bitstream_cavlc = !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH; int back_up_bitstream = slice_max_size || back_up_bitstream_cavlc; int starting_bits = bs_pos(&h->out.bs); int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1; int b_hpel = h->fdec->b_kept_as_ref; int orig_last_mb = h->sh.i_last_mb; int thread_last_mb = h->i_threadslice_end * h->mb.i_mb_width - 1; uint8_t *last_emu_check; #define BS_BAK_SLICE_MAX_SIZE 0 #define BS_BAK_CAVLC_OVERFLOW 1 #define BS_BAK_SLICE_MIN_MBS 2 #define BS_BAK_ROW_VBV 3 x264_bs_bak_t bs_bak[4]; b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv; bs_realign( &h->out.bs ); /* Slice */ nal_start( h, h->i_nal_type, h->i_nal_ref_idc ); h->out.nal[h->out.i_nal].i_first_mb = h->sh.i_first_mb; /* Slice header */ x264_macroblock_thread_init( h ); /* Set the QP equal to the first QP in the slice for more accurate CABAC initialization. */ h->mb.i_mb_xy = h->sh.i_first_mb; h->sh.i_qp = x264_ratecontrol_mb_qp( h ); h->sh.i_qp = SPEC_QP( h->sh.i_qp ); h->sh.i_qp_delta = h->sh.i_qp - h->pps->i_pic_init_qp; slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc ); if( h->param.b_cabac ) { /* alignment needed */ bs_align_1( &h->out.bs ); /* init cabac */ x264_cabac_context_init( h, &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc ); x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end ); last_emu_check = h->cabac.p; } else last_emu_check = h->out.bs.p; h->mb.i_last_qp = h->sh.i_qp; h->mb.i_last_dqp = 0; h->mb.field_decoding_flag = 0; i_mb_y = h->sh.i_first_mb / h->mb.i_mb_width; i_mb_x = h->sh.i_first_mb % h->mb.i_mb_width; i_skip = 0; while( 1 ) { mb_xy = i_mb_x + i_mb_y * h->mb.i_mb_width; int mb_spos = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac); if( i_mb_x == 0 ) { if( bitstream_check_buffer( h ) ) return -1; if( !(i_mb_y & SLICE_MBAFF) && h->param.rc.i_vbv_buffer_size ) bitstream_backup( h, &bs_bak[BS_BAK_ROW_VBV], i_skip, 1 ); if( !h->mb.b_reencode_mb ) fdec_filter_row( h, i_mb_y, 0 ); } if( back_up_bitstream ) { if( back_up_bitstream_cavlc ) bitstream_backup( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], i_skip, 0 ); if( slice_max_size && !(i_mb_y & SLICE_MBAFF) ) { bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 ); if( (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs ) bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 ); } } if( PARAM_INTERLACED ) { if( h->mb.b_adaptive_mbaff ) { if( !(i_mb_y&1) ) { /* FIXME: VSAD is fast but fairly poor at choosing the best interlace type. */ h->mb.b_interlaced = x264_field_vsad( h, i_mb_x, i_mb_y ); memcpy( &h->zigzagf, MB_INTERLACED ? 
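                    /* Note on the bitstream backups taken above: BS_BAK_ROW_VBV is saved
                     * at the start of each MB row (VBV only) so ratecontrol can force a
                     * whole-row re-encode; BS_BAK_CAVLC_OVERFLOW is saved per MB for
                     * profiles below High so a CAVLC level-code overflow can be retried
                     * at a higher QP; BS_BAK_SLICE_MAX_SIZE / BS_BAK_SLICE_MIN_MBS let a
                     * slice be ended early or rolled back when slice-max-size is hit. */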
&h->zigzagf_interlaced : &h->zigzagf_progressive, sizeof(h->zigzagf) ); if( !MB_INTERLACED && (i_mb_y+2) == h->mb.i_mb_height ) x264_expand_border_mbpair( h, i_mb_x, i_mb_y ); } } h->mb.field[mb_xy] = MB_INTERLACED; } /* load cache */ if( SLICE_MBAFF ) x264_macroblock_cache_load_interlaced( h, i_mb_x, i_mb_y ); else x264_macroblock_cache_load_progressive( h, i_mb_x, i_mb_y ); x264_macroblock_analyse( h ); /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */ reencode: x264_macroblock_encode( h ); if( h->param.b_cabac ) { if( mb_xy > h->sh.i_first_mb && !(SLICE_MBAFF && (i_mb_y&1)) ) x264_cabac_encode_terminal( &h->cabac ); if( IS_SKIP( h->mb.i_type ) ) x264_cabac_mb_skip( h, 1 ); else { if( h->sh.i_type != SLICE_TYPE_I ) x264_cabac_mb_skip( h, 0 ); x264_macroblock_write_cabac( h, &h->cabac ); } } else { if( IS_SKIP( h->mb.i_type ) ) i_skip++; else { if( h->sh.i_type != SLICE_TYPE_I ) { bs_write_ue( &h->out.bs, i_skip ); /* skip run */ i_skip = 0; } x264_macroblock_write_cavlc( h ); /* If there was a CAVLC level code overflow, try again at a higher QP. */ if( h->mb.b_overflow ) { h->mb.i_chroma_qp = h->chroma_qp_table[++h->mb.i_qp]; h->mb.i_skip_intra = 0; h->mb.b_skip_mc = 0; h->mb.b_overflow = 0; bitstream_restore( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], &i_skip, 0 ); goto reencode; } } } int total_bits = bs_pos(&h->out.bs) + x264_cabac_pos(&h->cabac); int mb_size = total_bits - mb_spos; if( slice_max_size && (!SLICE_MBAFF || (i_mb_y&1)) ) { /* Count the skip run, just in case. */ if( !h->param.b_cabac ) total_bits += bs_size_ue_big( i_skip ); /* Check for escape bytes. */ uint8_t *end = h->param.b_cabac ? h->cabac.p : h->out.bs.p; for( ; last_emu_check < end - 2; last_emu_check++ ) if( last_emu_check[0] == 0 && last_emu_check[1] == 0 && last_emu_check[2] <= 3 ) { slice_max_size -= 8; last_emu_check++; } /* We'll just re-encode this last macroblock if we go over the max slice size. */ if( total_bits - starting_bits > slice_max_size && !h->mb.b_reencode_mb ) { if( !x264_frame_new_slice( h, h->fdec ) ) { /* Handle the most obnoxious slice-min-mbs edge case: we need to end the slice * because it's gone over the maximum size, but doing so would violate slice-min-mbs. * If possible, roll back to the last checkpoint and try again. * We could try raising QP, but that would break in the case where a slice spans multiple * rows, which the re-encoding infrastructure can't currently handle. 
*/ if( mb_xy <= thread_last_mb && (thread_last_mb+1-mb_xy) < h->param.i_slice_min_mbs ) { if( thread_last_mb-h->param.i_slice_min_mbs < h->sh.i_first_mb+h->param.i_slice_min_mbs ) { x264_log( h, X264_LOG_WARNING, "slice-max-size violated (frame %d, cause: slice-min-mbs)\n", h->i_frame ); slice_max_size = 0; goto cont; } bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], &i_skip, 0 ); h->mb.b_reencode_mb = 1; h->sh.i_last_mb = thread_last_mb-h->param.i_slice_min_mbs; break; } if( mb_xy-SLICE_MBAFF*h->mb.i_mb_stride != h->sh.i_first_mb ) { bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 ); h->mb.b_reencode_mb = 1; if( SLICE_MBAFF ) { // set to bottom of previous mbpair if( i_mb_x ) h->sh.i_last_mb = mb_xy-1+h->mb.i_mb_stride*(!(i_mb_y&1)); else h->sh.i_last_mb = (i_mb_y-2+!(i_mb_y&1))*h->mb.i_mb_stride + h->mb.i_mb_width - 1; } else h->sh.i_last_mb = mb_xy-1; break; } else h->sh.i_last_mb = mb_xy; } else slice_max_size = 0; } } cont: h->mb.b_reencode_mb = 0; /* save cache */ x264_macroblock_cache_save( h ); if( x264_ratecontrol_mb( h, mb_size ) < 0 ) { bitstream_restore( h, &bs_bak[BS_BAK_ROW_VBV], &i_skip, 1 ); h->mb.b_reencode_mb = 1; i_mb_x = 0; i_mb_y = i_mb_y - SLICE_MBAFF; h->mb.i_mb_prev_xy = i_mb_y * h->mb.i_mb_stride - 1; h->sh.i_last_mb = orig_last_mb; continue; } /* accumulate mb stats */ h->stat.frame.i_mb_count[h->mb.i_type]++; int b_intra = IS_INTRA( h->mb.i_type ); int b_skip = IS_SKIP( h->mb.i_type ); if( h->param.i_log_level >= X264_LOG_INFO || h->param.rc.b_stat_write ) { if( !b_intra && !b_skip && !IS_DIRECT( h->mb.i_type ) ) { if( h->mb.i_partition != D_8x8 ) h->stat.frame.i_mb_partition[h->mb.i_partition] += 4; else for( int i = 0; i < 4; i++ ) h->stat.frame.i_mb_partition[h->mb.i_sub_partition[i]] ++; if( h->param.i_frame_reference > 1 ) for( int i_list = 0; i_list <= (h->sh.i_type == SLICE_TYPE_B); i_list++ ) for( int i = 0; i < 4; i++ ) { int i_ref = h->mb.cache.ref[i_list][ x264_scan8[4*i] ]; if( i_ref >= 0 ) h->stat.frame.i_mb_count_ref[i_list][i_ref] ++; } } } if( h->param.i_log_level >= X264_LOG_INFO ) { if( h->mb.i_cbp_luma | h->mb.i_cbp_chroma ) { if( CHROMA444 ) { for( int i = 0; i < 4; i++ ) if( h->mb.i_cbp_luma & (1 << i) ) for( int p = 0; p < 3; p++ ) { int s8 = i*4+p*16; int nnz8x8 = M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+0] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[s8]+8] ); h->stat.frame.i_mb_cbp[!b_intra + p*2] += !!nnz8x8; } } else { int cbpsum = (h->mb.i_cbp_luma&1) + ((h->mb.i_cbp_luma>>1)&1) + ((h->mb.i_cbp_luma>>2)&1) + (h->mb.i_cbp_luma>>3); h->stat.frame.i_mb_cbp[!b_intra + 0] += cbpsum; h->stat.frame.i_mb_cbp[!b_intra + 2] += !!h->mb.i_cbp_chroma; h->stat.frame.i_mb_cbp[!b_intra + 4] += h->mb.i_cbp_chroma >> 1; } } if( h->mb.i_cbp_luma && !b_intra ) { h->stat.frame.i_mb_count_8x8dct[0] ++; h->stat.frame.i_mb_count_8x8dct[1] += h->mb.b_transform_8x8; } if( b_intra && h->mb.i_type != I_PCM ) { if( h->mb.i_type == I_16x16 ) h->stat.frame.i_mb_pred_mode[0][h->mb.i_intra16x16_pred_mode]++; else if( h->mb.i_type == I_8x8 ) for( int i = 0; i < 16; i += 4 ) h->stat.frame.i_mb_pred_mode[1][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++; else //if( h->mb.i_type == I_4x4 ) for( int i = 0; i < 16; i++ ) h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++; h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++; } h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED; } /* calculate deblock strength values (actual deblocking is done per-row along with 
hpel) */ if( b_deblock ) x264_macroblock_deblock_strength( h ); if( mb_xy == h->sh.i_last_mb ) break; if( SLICE_MBAFF ) { i_mb_x += i_mb_y & 1; i_mb_y ^= i_mb_x < h->mb.i_mb_width; } else i_mb_x++; if( i_mb_x == h->mb.i_mb_width ) { i_mb_y++; i_mb_x = 0; } } if( h->sh.i_last_mb < h->sh.i_first_mb ) return 0; h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb; if( h->param.b_cabac ) { x264_cabac_encode_flush( h, &h->cabac ); h->out.bs.p = h->cabac.p; } else { if( i_skip > 0 ) bs_write_ue( &h->out.bs, i_skip ); /* last skip run */ /* rbsp_slice_trailing_bits */ bs_rbsp_trailing( &h->out.bs ); bs_flush( &h->out.bs ); } if( nal_end( h ) ) return -1; if( h->sh.i_last_mb == (h->i_threadslice_end * h->mb.i_mb_width - 1) ) { h->stat.frame.i_misc_bits = bs_pos( &h->out.bs ) + (h->out.i_nal*NALU_OVERHEAD * 8) - h->stat.frame.i_tex_bits - h->stat.frame.i_mv_bits; fdec_filter_row( h, h->i_threadslice_end, 0 ); if( h->param.b_sliced_threads ) { /* Tell the main thread we're done. */ x264_threadslice_cond_broadcast( h, 1 ); /* Do hpel now */ for( int mb_y = h->i_threadslice_start; mb_y <= h->i_threadslice_end; mb_y++ ) fdec_filter_row( h, mb_y, 1 ); x264_threadslice_cond_broadcast( h, 2 ); /* Do the first row of hpel, now that the previous slice is done */ if( h->i_thread_idx > 0 ) { x264_threadslice_cond_wait( h->thread[h->i_thread_idx-1], 2 ); fdec_filter_row( h, h->i_threadslice_start + (1 << SLICE_MBAFF), 2 ); } } /* Free mb info after the last thread's done using it */ if( h->fdec->mb_info_free && (!h->param.b_sliced_threads || h->i_thread_idx == (h->param.i_threads-1)) ) { h->fdec->mb_info_free( h->fdec->mb_info ); h->fdec->mb_info = NULL; h->fdec->mb_info_free = NULL; } } return 0; } static void thread_sync_context( x264_t *dst, x264_t *src ) { if( dst == src ) return; // reference counting for( x264_frame_t **f = src->frames.reference; *f; f++ ) (*f)->i_reference_count++; for( x264_frame_t **f = dst->frames.reference; *f; f++ ) x264_frame_push_unused( src, *f ); src->fdec->i_reference_count++; x264_frame_push_unused( src, dst->fdec ); // copy everything except the per-thread pointers and the constants. memcpy( &dst->i_frame, &src->i_frame, offsetof(x264_t, mb.base) - offsetof(x264_t, i_frame) ); dst->param = src->param; dst->stat = src->stat; dst->pixf = src->pixf; dst->reconfig = src->reconfig; } static void thread_sync_stat( x264_t *dst, x264_t *src ) { if( dst != src ) memcpy( &dst->stat, &src->stat, offsetof(x264_t, stat.frame) - offsetof(x264_t, stat) ); } static void *slices_write( x264_t *h ) { int i_slice_num = 0; int last_thread_mb = h->sh.i_last_mb; int round_bias = h->param.i_avcintra_class ? 
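    /* Note: with a fixed slice count, the boundary after slice n is placed at MB row
     * (height*n + round_bias) / i_slice_count; a bias of i_slice_count/2 gives
     * nearest-integer rounding so rows are spread evenly, while AVC-Intra uses a zero
     * bias.  Illustrative example: 68 MB rows split into 4 slices with bias 2 puts the
     * boundaries after rows 17, 34, 51 and 68, i.e. four 17-row slices. */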
0 : h->param.i_slice_count/2; /* init stats */ memset( &h->stat.frame, 0, sizeof(h->stat.frame) ); h->mb.b_reencode_mb = 0; while( h->sh.i_first_mb + SLICE_MBAFF*h->mb.i_mb_stride <= last_thread_mb ) { h->sh.i_last_mb = last_thread_mb; if( !i_slice_num || !x264_frame_new_slice( h, h->fdec ) ) { if( h->param.i_slice_max_mbs ) { if( SLICE_MBAFF ) { // convert first to mbaff form, add slice-max-mbs, then convert back to normal form int last_mbaff = 2*(h->sh.i_first_mb % h->mb.i_mb_width) + h->mb.i_mb_width*(h->sh.i_first_mb / h->mb.i_mb_width) + h->param.i_slice_max_mbs - 1; int last_x = (last_mbaff % (2*h->mb.i_mb_width))/2; int last_y = (last_mbaff / (2*h->mb.i_mb_width))*2 + 1; h->sh.i_last_mb = last_x + h->mb.i_mb_stride*last_y; } else { h->sh.i_last_mb = h->sh.i_first_mb + h->param.i_slice_max_mbs - 1; if( h->sh.i_last_mb < last_thread_mb && last_thread_mb - h->sh.i_last_mb < h->param.i_slice_min_mbs ) h->sh.i_last_mb = last_thread_mb - h->param.i_slice_min_mbs; } i_slice_num++; } else if( h->param.i_slice_count && !h->param.b_sliced_threads ) { int height = h->mb.i_mb_height >> PARAM_INTERLACED; int width = h->mb.i_mb_width << PARAM_INTERLACED; i_slice_num++; h->sh.i_last_mb = (height * i_slice_num + round_bias) / h->param.i_slice_count * width - 1; } } h->sh.i_last_mb = X264_MIN( h->sh.i_last_mb, last_thread_mb ); if( slice_write( h ) ) goto fail; h->sh.i_first_mb = h->sh.i_last_mb + 1; // if i_first_mb is not the last mb in a row then go to the next mb in MBAFF order if( SLICE_MBAFF && h->sh.i_first_mb % h->mb.i_mb_width ) h->sh.i_first_mb -= h->mb.i_mb_stride; } return (void *)0; fail: /* Tell other threads we're done, so they wouldn't wait for it */ if( h->param.b_sliced_threads ) x264_threadslice_cond_broadcast( h, 2 ); return (void *)-1; } static int threaded_slices_write( x264_t *h ) { int round_bias = h->param.i_avcintra_class ? 0 : h->param.i_slice_count/2; /* set first/last mb and sync contexts */ for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; if( i ) { t->param = h->param; memcpy( &t->i_frame, &h->i_frame, offsetof(x264_t, rc) - offsetof(x264_t, i_frame) ); } int height = h->mb.i_mb_height >> PARAM_INTERLACED; t->i_threadslice_start = ((height * i + round_bias) / h->param.i_threads) << PARAM_INTERLACED; t->i_threadslice_end = ((height * (i+1) + round_bias) / h->param.i_threads) << PARAM_INTERLACED; t->sh.i_first_mb = t->i_threadslice_start * h->mb.i_mb_width; t->sh.i_last_mb = t->i_threadslice_end * h->mb.i_mb_width - 1; } x264_analyse_weight_frame( h, h->mb.i_mb_height*16 + 16 ); x264_threads_distribute_ratecontrol( h ); /* setup */ for( int i = 0; i < h->param.i_threads; i++ ) { h->thread[i]->i_thread_idx = i; h->thread[i]->b_thread_active = 1; x264_threadslice_cond_broadcast( h->thread[i], 0 ); } /* dispatch */ for( int i = 0; i < h->param.i_threads; i++ ) x264_threadpool_run( h->threadpool, (void*)slices_write, h->thread[i] ); /* wait */ for( int i = 0; i < h->param.i_threads; i++ ) x264_threadslice_cond_wait( h->thread[i], 1 ); x264_threads_merge_ratecontrol( h ); for( int i = 1; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; for( int j = 0; j < t->out.i_nal; j++ ) { h->out.nal[h->out.i_nal] = t->out.nal[j]; h->out.i_nal++; nal_check_buffer( h ); } /* All entries in stat.frame are ints except for ssd/ssim. 
*/ for( size_t j = 0; j < (offsetof(x264_t,stat.frame.i_ssd) - offsetof(x264_t,stat.frame.i_mv_bits)) / sizeof(int); j++ ) ((int*)&h->stat.frame)[j] += ((int*)&t->stat.frame)[j]; for( int j = 0; j < 3; j++ ) h->stat.frame.i_ssd[j] += t->stat.frame.i_ssd[j]; h->stat.frame.f_ssim += t->stat.frame.f_ssim; h->stat.frame.i_ssim_cnt += t->stat.frame.i_ssim_cnt; } return 0; } void x264_encoder_intra_refresh( x264_t *h ) { h = h->thread[h->i_thread_phase]; h->b_queued_intra_refresh = 1; } int x264_encoder_invalidate_reference( x264_t *h, int64_t pts ) { if( h->param.i_bframe ) { x264_log( h, X264_LOG_ERROR, "x264_encoder_invalidate_reference is not supported with B-frames enabled\n" ); return -1; } if( h->param.b_intra_refresh ) { x264_log( h, X264_LOG_ERROR, "x264_encoder_invalidate_reference is not supported with intra refresh enabled\n" ); return -1; } h = h->thread[h->i_thread_phase]; if( pts >= h->i_last_idr_pts ) { for( int i = 0; h->frames.reference[i]; i++ ) if( pts <= h->frames.reference[i]->i_pts ) h->frames.reference[i]->b_corrupt = 1; if( pts <= h->fdec->i_pts ) h->fdec->b_corrupt = 1; } return 0; } /**************************************************************************** * x264_encoder_encode: * XXX: i_poc : is the poc of the current given picture * i_frame : is the number of the frame being coded * ex: type frame poc * I 0 2*0 * P 1 2*3 * B 2 2*1 * B 3 2*2 * P 4 2*6 * B 5 2*4 * B 6 2*5 ****************************************************************************/ int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ) { x264_t *thread_current, *thread_prev, *thread_oldest; int i_nal_type, i_nal_ref_idc, i_global_qp; int overhead = NALU_OVERHEAD; #if HAVE_OPENCL if( h->opencl.b_fatal_error ) return -1; #endif if( h->i_thread_frames > 1 ) { thread_prev = h->thread[ h->i_thread_phase ]; h->i_thread_phase = (h->i_thread_phase + 1) % h->i_thread_frames; thread_current = h->thread[ h->i_thread_phase ]; thread_oldest = h->thread[ (h->i_thread_phase + 1) % h->i_thread_frames ]; thread_sync_context( thread_current, thread_prev ); x264_thread_sync_ratecontrol( thread_current, thread_prev, thread_oldest ); h = thread_current; } else { thread_current = thread_oldest = h; } h->i_cpb_delay_pir_offset = h->i_cpb_delay_pir_offset_next; /* no data out */ *pi_nal = 0; *pp_nal = NULL; /* ------------------- Setup new frame from picture -------------------- */ if( pic_in != NULL ) { if( h->lookahead->b_exit_thread ) { x264_log( h, X264_LOG_ERROR, "lookahead thread is already stopped\n" ); return -1; } /* 1: Copy the picture to a frame and move it to a buffer */ x264_frame_t *fenc = x264_frame_pop_unused( h, 0 ); if( !fenc ) return -1; if( x264_frame_copy_picture( h, fenc, pic_in ) < 0 ) return -1; if( h->param.i_width != 16 * h->mb.i_mb_width || h->param.i_height != 16 * h->mb.i_mb_height ) x264_frame_expand_border_mod16( h, fenc ); fenc->i_frame = h->frames.i_input++; if( fenc->i_frame == 0 ) h->frames.i_first_pts = fenc->i_pts; if( h->frames.i_bframe_delay && fenc->i_frame == h->frames.i_bframe_delay ) h->frames.i_bframe_delay_time = fenc->i_pts - h->frames.i_first_pts; if( h->param.b_vfr_input && fenc->i_pts <= h->frames.i_largest_pts ) x264_log( h, X264_LOG_WARNING, "non-strictly-monotonic PTS\n" ); h->frames.i_second_largest_pts = h->frames.i_largest_pts; h->frames.i_largest_pts = fenc->i_pts; if( (fenc->i_pic_struct < PIC_STRUCT_AUTO) || (fenc->i_pic_struct > PIC_STRUCT_TRIPLE) ) fenc->i_pic_struct = PIC_STRUCT_AUTO; if( 
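        /* Note: out-of-range pic_struct values were reset to AUTO just above; AUTO is
         * resolved below to TOP_BOTTOM / BOTTOM_TOP when the input is flagged interlaced
         * (the per-frame param, if present, overriding the global one) and to
         * PROGRESSIVE otherwise. */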
fenc->i_pic_struct == PIC_STRUCT_AUTO ) { #if HAVE_INTERLACED int b_interlaced = fenc->param ? fenc->param->b_interlaced : h->param.b_interlaced; #else int b_interlaced = 0; #endif if( b_interlaced ) { int b_tff = fenc->param ? fenc->param->b_tff : h->param.b_tff; fenc->i_pic_struct = b_tff ? PIC_STRUCT_TOP_BOTTOM : PIC_STRUCT_BOTTOM_TOP; } else fenc->i_pic_struct = PIC_STRUCT_PROGRESSIVE; } if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read ) { if( x264_macroblock_tree_read( h, fenc, pic_in->prop.quant_offsets ) ) return -1; } else x264_adaptive_quant_frame( h, fenc, pic_in->prop.quant_offsets ); if( pic_in->prop.quant_offsets_free ) pic_in->prop.quant_offsets_free( pic_in->prop.quant_offsets ); if( h->frames.b_have_lowres ) x264_frame_init_lowres( h, fenc ); /* 2: Place the frame into the queue for its slice type decision */ x264_lookahead_put_frame( h, fenc ); if( h->frames.i_input <= h->frames.i_delay + 1 - h->i_thread_frames ) { /* Nothing yet to encode, waiting for filling of buffers */ pic_out->i_type = X264_TYPE_AUTO; return 0; } } else { /* signal kills for lookahead thread */ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); h->lookahead->b_exit_thread = 1; x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill ); x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); } h->i_frame++; /* 3: The picture is analyzed in the lookahead */ if( !h->frames.current[0] ) x264_lookahead_get_frames( h ); if( !h->frames.current[0] && x264_lookahead_is_empty( h ) ) return encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ); /* ------------------- Get frame to be encoded ------------------------- */ /* 4: get picture to encode */ h->fenc = x264_frame_shift( h->frames.current ); /* If applicable, wait for previous frame reconstruction to finish */ if( h->param.b_sliced_threads ) if( threadpool_wait_all( h ) < 0 ) return -1; if( h->i_frame == 0 ) h->i_reordered_pts_delay = h->fenc->i_reordered_pts; if( h->reconfig ) { x264_encoder_reconfig_apply( h, &h->reconfig_h->param ); h->reconfig = 0; } if( h->fenc->param ) { x264_encoder_reconfig_apply( h, h->fenc->param ); if( h->fenc->param->param_free ) { x264_param_cleanup( h->fenc->param ); h->fenc->param->param_free( h->fenc->param ); h->fenc->param = NULL; } } x264_ratecontrol_zone_init( h ); // ok to call this before encoding any frames, since the initial values of fdec have b_kept_as_ref=0 if( reference_update( h ) ) return -1; h->fdec->i_lines_completed = -1; if( !IS_X264_TYPE_I( h->fenc->i_type ) ) { int valid_refs_left = 0; for( int i = 0; h->frames.reference[i]; i++ ) if( !h->frames.reference[i]->b_corrupt ) valid_refs_left++; /* No valid reference frames left: force an IDR. 
*/ if( !valid_refs_left ) { h->fenc->b_keyframe = 1; h->fenc->i_type = X264_TYPE_IDR; } } if( h->fenc->b_keyframe ) { h->frames.i_last_keyframe = h->fenc->i_frame; if( h->fenc->i_type == X264_TYPE_IDR ) { h->i_frame_num = 0; h->frames.i_last_idr = h->fenc->i_frame; } } h->sh.i_mmco_command_count = h->sh.i_mmco_remove_from_end = 0; h->b_ref_reorder[0] = h->b_ref_reorder[1] = 0; h->fdec->i_poc = h->fenc->i_poc = 2 * ( h->fenc->i_frame - X264_MAX( h->frames.i_last_idr, 0 ) ); /* ------------------- Setup frame context ----------------------------- */ /* 5: Init data dependent of frame type */ if( h->fenc->i_type == X264_TYPE_IDR ) { /* reset ref pictures */ i_nal_type = NAL_SLICE_IDR; i_nal_ref_idc = NAL_PRIORITY_HIGHEST; h->sh.i_type = SLICE_TYPE_I; reference_reset( h ); h->frames.i_poc_last_open_gop = -1; } else if( h->fenc->i_type == X264_TYPE_I ) { i_nal_type = NAL_SLICE; i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/ h->sh.i_type = SLICE_TYPE_I; reference_hierarchy_reset( h ); if( h->param.b_open_gop ) h->frames.i_poc_last_open_gop = h->fenc->b_keyframe ? h->fenc->i_poc : -1; } else if( h->fenc->i_type == X264_TYPE_P ) { i_nal_type = NAL_SLICE; i_nal_ref_idc = NAL_PRIORITY_HIGH; /* Not completely true but for now it is (as all I/P are kept as ref)*/ h->sh.i_type = SLICE_TYPE_P; reference_hierarchy_reset( h ); h->frames.i_poc_last_open_gop = -1; } else if( h->fenc->i_type == X264_TYPE_BREF ) { i_nal_type = NAL_SLICE; i_nal_ref_idc = h->param.i_bframe_pyramid == X264_B_PYRAMID_STRICT ? NAL_PRIORITY_LOW : NAL_PRIORITY_HIGH; h->sh.i_type = SLICE_TYPE_B; reference_hierarchy_reset( h ); } else /* B frame */ { i_nal_type = NAL_SLICE; i_nal_ref_idc = NAL_PRIORITY_DISPOSABLE; h->sh.i_type = SLICE_TYPE_B; } h->fdec->i_type = h->fenc->i_type; h->fdec->i_frame = h->fenc->i_frame; h->fenc->b_kept_as_ref = h->fdec->b_kept_as_ref = i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE && h->param.i_keyint_max > 1; h->fdec->mb_info = h->fenc->mb_info; h->fdec->mb_info_free = h->fenc->mb_info_free; h->fenc->mb_info = NULL; h->fenc->mb_info_free = NULL; h->fdec->i_pts = h->fenc->i_pts; if( h->frames.i_bframe_delay ) { int64_t *prev_reordered_pts = thread_current->frames.i_prev_reordered_pts; h->fdec->i_dts = h->i_frame > h->frames.i_bframe_delay ? 
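                   /* Note: when B-frames delay output, the first i_bframe_delay frames
                    * get a DTS of (reordered PTS - i_bframe_delay_time); after that the
                    * DTS is simply the reordered PTS from i_bframe_delay frames earlier,
                    * kept in the small prev_reordered_pts ring buffer. */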
prev_reordered_pts[ (h->i_frame - h->frames.i_bframe_delay) % h->frames.i_bframe_delay ] : h->fenc->i_reordered_pts - h->frames.i_bframe_delay_time; prev_reordered_pts[ h->i_frame % h->frames.i_bframe_delay ] = h->fenc->i_reordered_pts; } else h->fdec->i_dts = h->fenc->i_reordered_pts; if( h->fenc->i_type == X264_TYPE_IDR ) h->i_last_idr_pts = h->fdec->i_pts; /* ------------------- Init ----------------------------- */ /* build ref list 0/1 */ reference_build_list( h, h->fdec->i_poc ); /* ---------------------- Write the bitstream -------------------------- */ /* Init bitstream context */ if( h->param.b_sliced_threads ) { for( int i = 0; i < h->param.i_threads; i++ ) { bs_init( &h->thread[i]->out.bs, h->thread[i]->out.p_bitstream, h->thread[i]->out.i_bitstream ); h->thread[i]->out.i_nal = 0; } } else { bs_init( &h->out.bs, h->out.p_bitstream, h->out.i_bitstream ); h->out.i_nal = 0; } if( h->param.b_aud ) { int pic_type; if( h->sh.i_type == SLICE_TYPE_I ) pic_type = 0; else if( h->sh.i_type == SLICE_TYPE_P ) pic_type = 1; else if( h->sh.i_type == SLICE_TYPE_B ) pic_type = 2; else pic_type = 7; nal_start( h, NAL_AUD, NAL_PRIORITY_DISPOSABLE ); bs_write( &h->out.bs, 3, pic_type ); bs_rbsp_trailing( &h->out.bs ); bs_flush( &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD; } h->i_nal_type = i_nal_type; h->i_nal_ref_idc = i_nal_ref_idc; if( h->param.b_intra_refresh ) { if( IS_X264_TYPE_I( h->fenc->i_type ) ) { h->fdec->i_frames_since_pir = 0; h->b_queued_intra_refresh = 0; /* PIR is currently only supported with ref == 1, so any intra frame effectively refreshes * the whole frame and counts as an intra refresh. */ h->fdec->f_pir_position = h->mb.i_mb_width; } else if( h->fenc->i_type == X264_TYPE_P ) { int pocdiff = (h->fdec->i_poc - h->fref[0][0]->i_poc)/2; float increment = X264_MAX( ((float)h->mb.i_mb_width-1) / h->param.i_keyint_max, 1 ); h->fdec->f_pir_position = h->fref[0][0]->f_pir_position; h->fdec->i_frames_since_pir = h->fref[0][0]->i_frames_since_pir + pocdiff; if( h->fdec->i_frames_since_pir >= h->param.i_keyint_max || (h->b_queued_intra_refresh && h->fdec->f_pir_position + 0.5 >= h->mb.i_mb_width) ) { h->fdec->f_pir_position = 0; h->fdec->i_frames_since_pir = 0; h->b_queued_intra_refresh = 0; h->fenc->b_keyframe = 1; } h->fdec->i_pir_start_col = h->fdec->f_pir_position+0.5; h->fdec->f_pir_position += increment * pocdiff; h->fdec->i_pir_end_col = h->fdec->f_pir_position+0.5; /* If our intra refresh has reached the right side of the frame, we're done. 
*/ if( h->fdec->i_pir_end_col >= h->mb.i_mb_width - 1 ) { h->fdec->f_pir_position = h->mb.i_mb_width; h->fdec->i_pir_end_col = h->mb.i_mb_width - 1; } } } if( h->fenc->b_keyframe ) { /* Write SPS and PPS */ if( h->param.b_repeat_headers ) { /* generate sequence parameters */ nal_start( h, NAL_SPS, NAL_PRIORITY_HIGHEST ); x264_sps_write( &h->out.bs, h->sps ); if( nal_end( h ) ) return -1; /* Pad AUD/SPS to 256 bytes like Panasonic */ if( h->param.i_avcintra_class ) h->out.nal[h->out.i_nal-1].i_padding = 256 - bs_pos( &h->out.bs ) / 8 - 2*NALU_OVERHEAD; overhead += h->out.nal[h->out.i_nal-1].i_payload + h->out.nal[h->out.i_nal-1].i_padding + NALU_OVERHEAD; /* generate picture parameters */ nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST ); x264_pps_write( &h->out.bs, h->sps, h->pps ); if( nal_end( h ) ) return -1; if( h->param.i_avcintra_class ) { int total_len = 256; /* Sony XAVC uses an oversized PPS instead of SEI padding */ if( h->param.i_avcintra_flavor == X264_AVCINTRA_FLAVOR_SONY ) total_len += h->param.i_height >= 1080 ? 18*512 : 10*512; h->out.nal[h->out.i_nal-1].i_padding = total_len - h->out.nal[h->out.i_nal-1].i_payload - NALU_OVERHEAD; } overhead += h->out.nal[h->out.i_nal-1].i_payload + h->out.nal[h->out.i_nal-1].i_padding + NALU_OVERHEAD; } /* when frame threading is used, buffering period sei is written in encoder_frame_end */ if( h->i_thread_frames == 1 && h->sps->vui.b_nal_hrd_parameters_present ) { x264_hrd_fullness( h ); nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_buffering_period_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } } /* write extra sei */ for( int i = 0; i < h->fenc->extra_sei.num_payloads; i++ ) { nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_write( &h->out.bs, h->fenc->extra_sei.payloads[i].payload, h->fenc->extra_sei.payloads[i].payload_size, h->fenc->extra_sei.payloads[i].payload_type ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; if( h->fenc->extra_sei.sei_free ) { h->fenc->extra_sei.sei_free( h->fenc->extra_sei.payloads[i].payload ); h->fenc->extra_sei.payloads[i].payload = NULL; } } if( h->fenc->extra_sei.sei_free ) { h->fenc->extra_sei.sei_free( h->fenc->extra_sei.payloads ); h->fenc->extra_sei.payloads = NULL; h->fenc->extra_sei.sei_free = NULL; } if( h->fenc->b_keyframe ) { /* Avid's decoder strictly wants two SEIs for AVC-Intra so we can't insert the x264 SEI */ if( h->param.b_repeat_headers && h->fenc->i_frame == 0 && !h->param.i_avcintra_class ) { /* identify ourself */ nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); if( x264_sei_version_write( h, &h->out.bs ) ) return -1; if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } if( h->fenc->i_type != X264_TYPE_IDR ) { int time_to_recovery = h->param.b_open_gop ? 
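            /* Note: for a non-IDR keyframe the recovery-point SEI tells the decoder how
             * many frames to wait for a full refresh -- zero for open-GOP keyframes,
             * otherwise roughly one frame per refreshed MB column (capped by keyint)
             * plus an allowance for B-frame reordering.  This is an informal reading of
             * the expression below, not a normative description. */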
0 : X264_MIN( h->mb.i_mb_width - 1, h->param.i_keyint_max ) + h->param.i_bframe - 1; nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_recovery_point_write( h, &h->out.bs, time_to_recovery ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } if( h->param.mastering_display.b_mastering_display ) { nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_mastering_display_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } if( h->param.content_light_level.b_cll ) { nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_content_light_level_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } if( h->param.i_alternative_transfer != 2 ) { nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_alternative_transfer_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } } if( h->param.i_frame_packing >= 0 && (h->fenc->b_keyframe || h->param.i_frame_packing == 5) ) { nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_frame_packing_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } /* generate sei pic timing */ if( h->sps->vui.b_pic_struct_present || h->sps->vui.b_nal_hrd_parameters_present ) { nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_pic_timing_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } /* As required by Blu-ray. */ if( !IS_X264_TYPE_B( h->fenc->i_type ) && h->b_sh_backup ) { h->b_sh_backup = 0; nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_dec_ref_pic_marking_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; } if( h->fenc->b_keyframe && h->param.b_intra_refresh ) h->i_cpb_delay_pir_offset_next = h->fenc->i_cpb_delay; /* Filler space: 10 or 18 SEIs' worth of space, depending on resolution */ if( h->param.i_avcintra_class && h->param.i_avcintra_flavor != X264_AVCINTRA_FLAVOR_SONY ) { /* Write an empty filler NAL to mimic the AUD in the P2 format*/ nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE ); x264_filler_write( h, &h->out.bs, 0 ); if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + NALU_OVERHEAD; /* All lengths are magic lengths that decoders expect to see */ /* "UMID" SEI */ nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); if( x264_sei_avcintra_umid_write( h, &h->out.bs ) < 0 ) return -1; if( nal_end( h ) ) return -1; overhead += h->out.nal[h->out.i_nal-1].i_payload + SEI_OVERHEAD; int unpadded_len; int total_len; if( h->param.i_height == 1080 ) { unpadded_len = 5780; total_len = 17*512; } else { unpadded_len = 2900; total_len = 9*512; } /* "VANC" SEI */ nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); if( x264_sei_avcintra_vanc_write( h, &h->out.bs, unpadded_len ) < 0 ) return -1; if( nal_end( h ) ) return -1; h->out.nal[h->out.i_nal-1].i_padding = total_len - h->out.nal[h->out.i_nal-1].i_payload - SEI_OVERHEAD; overhead += h->out.nal[h->out.i_nal-1].i_payload + h->out.nal[h->out.i_nal-1].i_padding + SEI_OVERHEAD; } /* Init the rate control */ /* FIXME: Include slice header bit cost. 
*/ x264_ratecontrol_start( h, h->fenc->i_qpplus1, overhead*8 ); i_global_qp = x264_ratecontrol_qp( h ); pic_out->i_qpplus1 = h->fdec->i_qpplus1 = i_global_qp + 1; if( h->param.rc.b_stat_read && h->sh.i_type != SLICE_TYPE_I ) { x264_reference_build_list_optimal( h ); reference_check_reorder( h ); } if( h->i_ref[0] ) h->fdec->i_poc_l0ref0 = h->fref[0][0]->i_poc; /* ------------------------ Create slice header ----------------------- */ slice_init( h, i_nal_type, i_global_qp ); /*------------------------- Weights -------------------------------------*/ if( h->sh.i_type == SLICE_TYPE_B ) x264_macroblock_bipred_init( h ); weighted_pred_init( h ); if( i_nal_ref_idc != NAL_PRIORITY_DISPOSABLE ) h->i_frame_num++; /* Write frame */ h->i_threadslice_start = 0; h->i_threadslice_end = h->mb.i_mb_height; if( h->i_thread_frames > 1 ) { x264_threadpool_run( h->threadpool, (void*)slices_write, h ); h->b_thread_active = 1; } else if( h->param.b_sliced_threads ) { if( threaded_slices_write( h ) ) return -1; } else if( (intptr_t)slices_write( h ) ) return -1; return encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ); } static int encoder_frame_end( x264_t *h, x264_t *thread_current, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_out ) { char psz_message[80]; if( !h->param.b_sliced_threads && h->b_thread_active ) { h->b_thread_active = 0; if( (intptr_t)x264_threadpool_wait( h->threadpool, h ) ) return -1; } if( !h->out.i_nal ) { pic_out->i_type = X264_TYPE_AUTO; return 0; } x264_emms(); /* generate buffering period sei and insert it into place */ if( h->i_thread_frames > 1 && h->fenc->b_keyframe && h->sps->vui.b_nal_hrd_parameters_present ) { x264_hrd_fullness( h ); nal_start( h, NAL_SEI, NAL_PRIORITY_DISPOSABLE ); x264_sei_buffering_period_write( h, &h->out.bs ); if( nal_end( h ) ) return -1; /* buffering period sei must follow AUD, SPS and PPS and precede all other SEIs */ int idx = 0; while( h->out.nal[idx].i_type == NAL_AUD || h->out.nal[idx].i_type == NAL_SPS || h->out.nal[idx].i_type == NAL_PPS ) idx++; x264_nal_t nal_tmp = h->out.nal[h->out.i_nal-1]; memmove( &h->out.nal[idx+1], &h->out.nal[idx], (h->out.i_nal-idx-1)*sizeof(x264_nal_t) ); h->out.nal[idx] = nal_tmp; } int frame_size = encoder_encapsulate_nals( h, 0 ); if( frame_size < 0 ) return -1; /* Set output picture properties */ pic_out->i_type = h->fenc->i_type; pic_out->b_keyframe = h->fenc->b_keyframe; pic_out->i_pic_struct = h->fenc->i_pic_struct; pic_out->i_pts = h->fdec->i_pts; pic_out->i_dts = h->fdec->i_dts; if( pic_out->i_pts < pic_out->i_dts ) x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" ); pic_out->opaque = h->fenc->opaque; pic_out->img.i_csp = h->fdec->i_csp; #if HIGH_BIT_DEPTH pic_out->img.i_csp |= X264_CSP_HIGH_DEPTH; #endif pic_out->img.i_plane = h->fdec->i_plane; for( int i = 0; i < pic_out->img.i_plane; i++ ) { pic_out->img.i_stride[i] = h->fdec->i_stride[i] * SIZEOF_PIXEL; pic_out->img.plane[i] = (uint8_t*)h->fdec->plane[i]; } x264_frame_push_unused( thread_current, h->fenc ); /* ---------------------- Update encoder state ------------------------- */ /* update rc */ int filler = 0; if( x264_ratecontrol_end( h, frame_size * 8, &filler ) < 0 ) return -1; pic_out->hrd_timing = h->fenc->hrd_timing; pic_out->prop.f_crf_avg = h->fdec->f_crf_avg; /* Filler in AVC-Intra mode is written as zero bytes to the last slice * We don't know the size of the last slice until encapsulation so we add filler to the encapsulated NAL */ if( h->param.i_avcintra_class ) { if( 
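        /* Note: in AVC-Intra mode the rate-control filler is appended as zero bytes to
         * the last slice NAL (and, for non-Annex-B output, the 4-byte big-endian length
         * prefix is rewritten afterwards); the generic path in the else-branch instead
         * emits separate NAL_FILLER units, split so each respects slice-max-size. */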
check_encapsulated_buffer( h, h->thread[0], h->out.i_nal, frame_size, (int64_t)frame_size + filler ) < 0 ) return -1; x264_nal_t *nal = &h->out.nal[h->out.i_nal-1]; memset( nal->p_payload + nal->i_payload, 0, filler ); nal->i_payload += filler; nal->i_padding = filler; frame_size += filler; /* Fix up the size header for mp4/etc */ if( !h->param.b_annexb ) { /* Size doesn't include the size of the header we're writing now. */ uint8_t *nal_data = nal->p_payload; int chunk_size = nal->i_payload - 4; nal_data[0] = chunk_size >> 24; nal_data[1] = chunk_size >> 16; nal_data[2] = chunk_size >> 8; nal_data[3] = chunk_size >> 0; } } else { while( filler > 0 ) { int f, overhead = FILLER_OVERHEAD - h->param.b_annexb; if( h->param.i_slice_max_size && filler > h->param.i_slice_max_size ) { int next_size = filler - h->param.i_slice_max_size; int overflow = X264_MAX( overhead - next_size, 0 ); f = h->param.i_slice_max_size - overhead - overflow; } else f = X264_MAX( 0, filler - overhead ); if( bitstream_check_buffer_filler( h, f ) ) return -1; nal_start( h, NAL_FILLER, NAL_PRIORITY_DISPOSABLE ); x264_filler_write( h, &h->out.bs, f ); if( nal_end( h ) ) return -1; int total_size = encoder_encapsulate_nals( h, h->out.i_nal-1 ); if( total_size < 0 ) return -1; frame_size += total_size; filler -= total_size; } } /* End bitstream, set output */ *pi_nal = h->out.i_nal; *pp_nal = h->out.nal; h->out.i_nal = 0; x264_noise_reduction_update( h ); /* ---------------------- Compute/Print statistics --------------------- */ thread_sync_stat( h, h->thread[0] ); /* Slice stat */ h->stat.i_frame_count[h->sh.i_type]++; h->stat.i_frame_size[h->sh.i_type] += frame_size; h->stat.f_frame_qp[h->sh.i_type] += h->fdec->f_qp_avg_aq; for( int i = 0; i < X264_MBTYPE_MAX; i++ ) h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i]; for( int i = 0; i < 2; i++ ) h->stat.i_mb_count_8x8dct[i] += h->stat.frame.i_mb_count_8x8dct[i]; for( int i = 0; i < 6; i++ ) h->stat.i_mb_cbp[i] += h->stat.frame.i_mb_cbp[i]; for( int i = 0; i < 4; i++ ) for( int j = 0; j < 13; j++ ) h->stat.i_mb_pred_mode[i][j] += h->stat.frame.i_mb_pred_mode[i][j]; if( h->sh.i_type != SLICE_TYPE_I ) { for( int i = 0; i < X264_PARTTYPE_MAX; i++ ) h->stat.i_mb_partition[h->sh.i_type][i] += h->stat.frame.i_mb_partition[i]; for( int i_list = 0; i_list < 2; i_list++ ) for( int i = 0; i < X264_REF_MAX*2; i++ ) h->stat.i_mb_count_ref[h->sh.i_type][i_list][i] += h->stat.frame.i_mb_count_ref[i_list][i]; } for( int i = 0; i < 3; i++ ) h->stat.i_mb_field[i] += h->stat.frame.i_mb_field[i]; if( h->sh.i_type == SLICE_TYPE_P && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE ) { h->stat.i_wpred[0] += !!h->sh.weight[0][0].weightfn; h->stat.i_wpred[1] += !!h->sh.weight[0][1].weightfn || !!h->sh.weight[0][2].weightfn; } if( h->sh.i_type == SLICE_TYPE_B ) { h->stat.i_direct_frames[ h->sh.b_direct_spatial_mv_pred ] ++; if( h->mb.b_direct_auto_write ) { //FIXME somewhat arbitrary time constants if( h->stat.i_direct_score[0] + h->stat.i_direct_score[1] > h->mb.i_mb_count ) for( int i = 0; i < 2; i++ ) h->stat.i_direct_score[i] = h->stat.i_direct_score[i] * 9/10; for( int i = 0; i < 2; i++ ) h->stat.i_direct_score[i] += h->stat.frame.i_direct_score[i]; } } else h->stat.i_consecutive_bframes[h->fenc->i_bframes]++; psz_message[0] = '\0'; double dur = h->fenc->f_duration; h->stat.f_frame_duration[h->sh.i_type] += dur; if( h->param.analyse.b_psnr ) { int64_t ssd[3] = { h->stat.frame.i_ssd[0], h->stat.frame.i_ssd[1], h->stat.frame.i_ssd[2], }; int luma_size = h->param.i_width 
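                         /* Note: per-plane PSNR comes from the SSD accumulated during
                          * fdec_filter_row(); calc_psnr() is assumed here to implement
                          * the usual 10*log10(peak^2 * pixel_count / ssd) form, and the
                          * "Avg" value weights the planes by size (luma + 2 chroma). */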
* h->param.i_height; int chroma_size = CHROMA_SIZE( luma_size ); pic_out->prop.f_psnr[0] = calc_psnr( ssd[0], luma_size ); pic_out->prop.f_psnr[1] = calc_psnr( ssd[1], chroma_size ); pic_out->prop.f_psnr[2] = calc_psnr( ssd[2], chroma_size ); pic_out->prop.f_psnr_avg = calc_psnr( ssd[0] + ssd[1] + ssd[2], luma_size + chroma_size*2 ); h->stat.f_ssd_global[h->sh.i_type] += dur * (ssd[0] + ssd[1] + ssd[2]); h->stat.f_psnr_average[h->sh.i_type] += dur * pic_out->prop.f_psnr_avg; h->stat.f_psnr_mean_y[h->sh.i_type] += dur * pic_out->prop.f_psnr[0]; h->stat.f_psnr_mean_u[h->sh.i_type] += dur * pic_out->prop.f_psnr[1]; h->stat.f_psnr_mean_v[h->sh.i_type] += dur * pic_out->prop.f_psnr[2]; snprintf( psz_message, 80, " PSNR Y:%5.2f U:%5.2f V:%5.2f", pic_out->prop.f_psnr[0], pic_out->prop.f_psnr[1], pic_out->prop.f_psnr[2] ); } if( h->param.analyse.b_ssim ) { pic_out->prop.f_ssim = h->stat.frame.f_ssim / h->stat.frame.i_ssim_cnt; h->stat.f_ssim_mean_y[h->sh.i_type] += pic_out->prop.f_ssim * dur; int msg_len = strlen(psz_message); snprintf( psz_message + msg_len, 80 - msg_len, " SSIM Y:%.5f", pic_out->prop.f_ssim ); } psz_message[79] = '\0'; x264_log( h, X264_LOG_DEBUG, "frame=%4d QP=%.2f NAL=%d Slice:%c Poc:%-3d I:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n", h->i_frame, h->fdec->f_qp_avg_aq, h->i_nal_ref_idc, h->sh.i_type == SLICE_TYPE_I ? 'I' : (h->sh.i_type == SLICE_TYPE_P ? 'P' : 'B' ), h->fdec->i_poc, h->stat.frame.i_mb_count_i, h->stat.frame.i_mb_count_p, h->stat.frame.i_mb_count_skip, frame_size, psz_message ); // keep stats all in one place thread_sync_stat( h->thread[0], h ); // for the use of the next frame thread_sync_stat( thread_current, h ); #ifdef DEBUG_MB_TYPE { static const char mb_chars[] = { 'i', 'i', 'I', 'C', 'P', '8', 'S', 'D', '<', 'X', 'B', 'X', '>', 'B', 'B', 'B', 'B', '8', 'S' }; for( int mb_xy = 0; mb_xy < h->mb.i_mb_width * h->mb.i_mb_height; mb_xy++ ) { if( h->mb.type[mb_xy] < X264_MBTYPE_MAX && h->mb.type[mb_xy] >= 0 ) fprintf( stderr, "%c ", mb_chars[ h->mb.type[mb_xy] ] ); else fprintf( stderr, "? " ); if( (mb_xy+1) % h->mb.i_mb_width == 0 ) fprintf( stderr, "\n" ); } } #endif /* Remove duplicates, must be done near the end as breaks h->fref0 array * by freeing some of its pointers. */ for( int i = 0; i < h->i_ref[0]; i++ ) if( h->fref[0][i] && h->fref[0][i]->b_duplicate ) { x264_frame_push_blank_unused( h, h->fref[0][i] ); h->fref[0][i] = 0; } if( h->param.psz_dump_yuv ) frame_dump( h ); x264_emms(); return frame_size; } static void print_intra( int64_t *i_mb_count, double i_count, int b_print_pcm, char *intra ) { intra += sprintf( intra, "I16..4%s: %4.1f%% %4.1f%% %4.1f%%", b_print_pcm ? 
"..PCM" : "", i_mb_count[I_16x16]/ i_count, i_mb_count[I_8x8] / i_count, i_mb_count[I_4x4] / i_count ); if( b_print_pcm ) sprintf( intra, " %4.1f%%", i_mb_count[I_PCM] / i_count ); } /**************************************************************************** * x264_encoder_close: ****************************************************************************/ void x264_encoder_close ( x264_t *h ) { int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height ); int64_t i_mb_count_size[2][7] = {{0}}; char buf[200]; int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM] || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM] || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM]; x264_lookahead_delete( h ); #if HAVE_OPENCL x264_opencl_lookahead_delete( h ); x264_opencl_function_t *ocl = h->opencl.ocl; #endif if( h->param.b_sliced_threads ) threadpool_wait_all( h ); if( h->param.i_threads > 1 ) x264_threadpool_delete( h->threadpool ); if( h->param.i_lookahead_threads > 1 ) x264_threadpool_delete( h->lookaheadpool ); if( h->i_thread_frames > 1 ) { for( int i = 0; i < h->i_thread_frames; i++ ) if( h->thread[i]->b_thread_active ) { assert( h->thread[i]->fenc->i_reference_count == 1 ); x264_frame_delete( h->thread[i]->fenc ); } x264_t *thread_prev = h->thread[h->i_thread_phase]; x264_thread_sync_ratecontrol( h, thread_prev, h ); x264_thread_sync_ratecontrol( thread_prev, thread_prev, h ); h->i_frame = thread_prev->i_frame + 1 - h->i_thread_frames; } h->i_frame++; /* Slices used and PSNR */ for( int i = 0; i < 3; i++ ) { static const uint8_t slice_order[] = { SLICE_TYPE_I, SLICE_TYPE_P, SLICE_TYPE_B }; int i_slice = slice_order[i]; if( h->stat.i_frame_count[i_slice] > 0 ) { int i_count = h->stat.i_frame_count[i_slice]; double dur = h->stat.f_frame_duration[i_slice]; if( h->param.analyse.b_psnr ) { x264_log( h, X264_LOG_INFO, "frame %c:%-5d Avg QP:%5.2f size:%6.0f PSNR Mean Y:%5.2f U:%5.2f V:%5.2f Avg:%5.2f Global:%5.2f\n", slice_type_to_char[i_slice], i_count, h->stat.f_frame_qp[i_slice] / i_count, (double)h->stat.i_frame_size[i_slice] / i_count, h->stat.f_psnr_mean_y[i_slice] / dur, h->stat.f_psnr_mean_u[i_slice] / dur, h->stat.f_psnr_mean_v[i_slice] / dur, h->stat.f_psnr_average[i_slice] / dur, calc_psnr( h->stat.f_ssd_global[i_slice], dur * i_yuv_size ) ); } else { x264_log( h, X264_LOG_INFO, "frame %c:%-5d Avg QP:%5.2f size:%6.0f\n", slice_type_to_char[i_slice], i_count, h->stat.f_frame_qp[i_slice] / i_count, (double)h->stat.i_frame_size[i_slice] / i_count ); } } } if( h->param.i_bframe && h->stat.i_frame_count[SLICE_TYPE_B] ) { char *p = buf; int den = 0; // weight by number of frames (including the I/P-frames) that are in a sequence of N B-frames for( int i = 0; i <= h->param.i_bframe; i++ ) den += (i+1) * h->stat.i_consecutive_bframes[i]; for( int i = 0; i <= h->param.i_bframe; i++ ) p += sprintf( p, " %4.1f%%", 100. 
* (i+1) * h->stat.i_consecutive_bframes[i] / den ); x264_log( h, X264_LOG_INFO, "consecutive B-frames:%s\n", buf ); } for( int i_type = 0; i_type < 2; i_type++ ) for( int i = 0; i < X264_PARTTYPE_MAX; i++ ) { if( i == D_DIRECT_8x8 ) continue; /* direct is counted as its own type */ i_mb_count_size[i_type][x264_mb_partition_pixel_table[i]] += h->stat.i_mb_partition[i_type][i]; } /* MB types used */ if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I]; double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, "mb I %s\n", buf ); } if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P]; double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P]; print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, "mb P %s P16..4: %4.1f%% %4.1f%% %4.1f%% %4.1f%% %4.1f%% skip:%4.1f%%\n", buf, i_mb_size[PIXEL_16x16] / (i_count*4), (i_mb_size[PIXEL_16x8] + i_mb_size[PIXEL_8x16]) / (i_count*4), i_mb_size[PIXEL_8x8] / (i_count*4), (i_mb_size[PIXEL_8x4] + i_mb_size[PIXEL_4x8]) / (i_count*4), i_mb_size[PIXEL_4x4] / (i_count*4), i_mb_count[P_SKIP] / i_count ); } if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B]; double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; double i_mb_list_count; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B]; int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */ print_intra( i_mb_count, i_count, b_print_pcm, buf ); for( int i = 0; i < X264_PARTTYPE_MAX; i++ ) for( int j = 0; j < 2; j++ ) { int l0 = x264_mb_type_list_table[i][0][j]; int l1 = x264_mb_type_list_table[i][1][j]; if( l0 || l1 ) list_count[l1+l0*l1] += h->stat.i_mb_count[SLICE_TYPE_B][i] * 2; } list_count[0] += h->stat.i_mb_partition[SLICE_TYPE_B][D_L0_8x8]; list_count[1] += h->stat.i_mb_partition[SLICE_TYPE_B][D_L1_8x8]; list_count[2] += h->stat.i_mb_partition[SLICE_TYPE_B][D_BI_8x8]; i_mb_count[B_DIRECT] += (h->stat.i_mb_partition[SLICE_TYPE_B][D_DIRECT_8x8]+2)/4; i_mb_list_count = (list_count[0] + list_count[1] + list_count[2]) / 100.0; sprintf( buf + strlen(buf), " B16..8: %4.1f%% %4.1f%% %4.1f%% direct:%4.1f%% skip:%4.1f%%", i_mb_size[PIXEL_16x16] / (i_count*4), (i_mb_size[PIXEL_16x8] + i_mb_size[PIXEL_8x16]) / (i_count*4), i_mb_size[PIXEL_8x8] / (i_count*4), i_mb_count[B_DIRECT] / i_count, i_mb_count[B_SKIP] / i_count ); if( i_mb_list_count != 0 ) sprintf( buf + strlen(buf), " L0:%4.1f%% L1:%4.1f%% BI:%4.1f%%", list_count[0] / i_mb_list_count, list_count[1] / i_mb_list_count, list_count[2] / i_mb_list_count ); x264_log( h, X264_LOG_INFO, "mb B %s\n", buf ); } x264_ratecontrol_summary( h ); if( h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B] > 0 ) { #define SUM3(p) (p[SLICE_TYPE_I] + p[SLICE_TYPE_P] + p[SLICE_TYPE_B]) #define SUM3b(p,o) (p[SLICE_TYPE_I][o] + p[SLICE_TYPE_P][o] + p[SLICE_TYPE_B][o]) int64_t i_i8x8 = SUM3b( h->stat.i_mb_count, I_8x8 ); int64_t i_intra = i_i8x8 + SUM3b( h->stat.i_mb_count, I_4x4 ) + SUM3b( h->stat.i_mb_count, I_16x16 ); int64_t i_all_intra = i_intra + SUM3b( h->stat.i_mb_count, I_PCM ); int64_t i_skip = SUM3b( h->stat.i_mb_count, P_SKIP ) + SUM3b( h->stat.i_mb_count, B_SKIP ); const int i_count = 
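        /* Note: the global summary below aggregates all slice types (the SUM3 macros);
         * the overall bitrate divides total bytes by duration and by 125, i.e.
         * bytes/s -> kb/s since 1 kbit = 125 bytes.  Illustrative example:
         * 1,250,000 bytes over 10 s comes out as 1,250,000 / 10 / 125 = 1000 kb/s. */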
h->stat.i_frame_count[SLICE_TYPE_I] + h->stat.i_frame_count[SLICE_TYPE_P] + h->stat.i_frame_count[SLICE_TYPE_B]; int64_t i_mb_count = (int64_t)i_count * h->mb.i_mb_count; int64_t i_inter = i_mb_count - i_skip - i_all_intra; const double duration = h->stat.f_frame_duration[SLICE_TYPE_I] + h->stat.f_frame_duration[SLICE_TYPE_P] + h->stat.f_frame_duration[SLICE_TYPE_B]; float f_bitrate = SUM3(h->stat.i_frame_size) / duration / 125; if( PARAM_INTERLACED ) { char *fieldstats = buf; fieldstats[0] = 0; if( i_inter ) fieldstats += sprintf( fieldstats, " inter:%.1f%%", h->stat.i_mb_field[1] * 100.0 / i_inter ); if( i_skip ) fieldstats += sprintf( fieldstats, " skip:%.1f%%", h->stat.i_mb_field[2] * 100.0 / i_skip ); x264_log( h, X264_LOG_INFO, "field mbs: intra: %.1f%%%s\n", h->stat.i_mb_field[0] * 100.0 / i_all_intra, buf ); } if( h->pps->b_transform_8x8_mode ) { buf[0] = 0; if( h->stat.i_mb_count_8x8dct[0] ) sprintf( buf, " inter:%.1f%%", 100. * h->stat.i_mb_count_8x8dct[1] / h->stat.i_mb_count_8x8dct[0] ); x264_log( h, X264_LOG_INFO, "8x8 transform intra:%.1f%%%s\n", 100. * i_i8x8 / X264_MAX( i_intra, 1 ), buf ); } if( (h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO || (h->stat.i_direct_frames[0] && h->stat.i_direct_frames[1])) && h->stat.i_frame_count[SLICE_TYPE_B] ) { x264_log( h, X264_LOG_INFO, "direct mvs spatial:%.1f%% temporal:%.1f%%\n", h->stat.i_direct_frames[1] * 100. / h->stat.i_frame_count[SLICE_TYPE_B], h->stat.i_direct_frames[0] * 100. / h->stat.i_frame_count[SLICE_TYPE_B] ); } buf[0] = 0; if( CHROMA_FORMAT ) { int csize = CHROMA444 ? 4 : 1; if( i_mb_count != i_all_intra ) sprintf( buf, " inter: %.1f%% %.1f%% %.1f%%", h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4), h->stat.i_mb_cbp[3] * 100.0 / ((i_mb_count - i_all_intra)*csize), h->stat.i_mb_cbp[5] * 100.0 / ((i_mb_count - i_all_intra)*csize) ); x264_log( h, X264_LOG_INFO, "coded y,%s,%s intra: %.1f%% %.1f%% %.1f%%%s\n", CHROMA444?"u":"uvDC", CHROMA444?"v":"uvAC", h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4), h->stat.i_mb_cbp[2] * 100.0 / (i_all_intra*csize), h->stat.i_mb_cbp[4] * 100.0 / (i_all_intra*csize), buf ); } else { if( i_mb_count != i_all_intra ) sprintf( buf, " inter: %.1f%%", h->stat.i_mb_cbp[1] * 100.0 / ((i_mb_count - i_all_intra)*4) ); x264_log( h, X264_LOG_INFO, "coded y intra: %.1f%%%s\n", h->stat.i_mb_cbp[0] * 100.0 / (i_all_intra*4), buf ); } int64_t fixed_pred_modes[4][9] = {{0}}; int64_t sum_pred_modes[4] = {0}; for( int i = 0; i <= I_PRED_16x16_DC_128; i++ ) { fixed_pred_modes[0][x264_mb_pred_mode16x16_fix[i]] += h->stat.i_mb_pred_mode[0][i]; sum_pred_modes[0] += h->stat.i_mb_pred_mode[0][i]; } if( sum_pred_modes[0] ) x264_log( h, X264_LOG_INFO, "i16 v,h,dc,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", fixed_pred_modes[0][0] * 100.0 / sum_pred_modes[0], fixed_pred_modes[0][1] * 100.0 / sum_pred_modes[0], fixed_pred_modes[0][2] * 100.0 / sum_pred_modes[0], fixed_pred_modes[0][3] * 100.0 / sum_pred_modes[0] ); for( int i = 1; i <= 2; i++ ) { for( int j = 0; j <= I_PRED_8x8_DC_128; j++ ) { fixed_pred_modes[i][x264_mb_pred_mode4x4_fix(j)] += h->stat.i_mb_pred_mode[i][j]; sum_pred_modes[i] += h->stat.i_mb_pred_mode[i][j]; } if( sum_pred_modes[i] ) x264_log( h, X264_LOG_INFO, "i%d v,h,dc,ddl,ddr,vr,hd,vl,hu: %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", (3-i)*4, fixed_pred_modes[i][0] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][1] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][2] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][3] * 100.0 / sum_pred_modes[i], 
fixed_pred_modes[i][4] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][5] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][6] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][7] * 100.0 / sum_pred_modes[i], fixed_pred_modes[i][8] * 100.0 / sum_pred_modes[i] ); } for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ ) { fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += h->stat.i_mb_pred_mode[3][i]; sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i]; } if( sum_pred_modes[3] && !CHROMA444 ) x264_log( h, X264_LOG_INFO, "i8c dc,h,v,p: %2.0f%% %2.0f%% %2.0f%% %2.0f%%\n", fixed_pred_modes[3][0] * 100.0 / sum_pred_modes[3], fixed_pred_modes[3][1] * 100.0 / sum_pred_modes[3], fixed_pred_modes[3][2] * 100.0 / sum_pred_modes[3], fixed_pred_modes[3][3] * 100.0 / sum_pred_modes[3] ); if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->stat.i_frame_count[SLICE_TYPE_P] > 0 ) { buf[0] = 0; if( CHROMA_FORMAT ) sprintf( buf, " UV:%.1f%%", h->stat.i_wpred[1] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P] ); x264_log( h, X264_LOG_INFO, "Weighted P-Frames: Y:%.1f%%%s\n", h->stat.i_wpred[0] * 100.0 / h->stat.i_frame_count[SLICE_TYPE_P], buf ); } for( int i_list = 0; i_list < 2; i_list++ ) for( int i_slice = 0; i_slice < 2; i_slice++ ) { char *p = buf; int64_t i_den = 0; int i_max = 0; for( int i = 0; i < X264_REF_MAX*2; i++ ) if( h->stat.i_mb_count_ref[i_slice][i_list][i] ) { i_den += h->stat.i_mb_count_ref[i_slice][i_list][i]; i_max = i; } if( i_max == 0 ) continue; for( int i = 0; i <= i_max; i++ ) p += sprintf( p, " %4.1f%%", 100. * h->stat.i_mb_count_ref[i_slice][i_list][i] / i_den ); x264_log( h, X264_LOG_INFO, "ref %c L%d:%s\n", "PB"[i_slice], i_list, buf ); } if( h->param.analyse.b_ssim ) { float ssim = SUM3( h->stat.f_ssim_mean_y ) / duration; x264_log( h, X264_LOG_INFO, "SSIM Mean Y:%.7f (%6.3fdb)\n", ssim, calc_ssim_db( ssim ) ); } if( h->param.analyse.b_psnr ) { x264_log( h, X264_LOG_INFO, "PSNR Mean Y:%6.3f U:%6.3f V:%6.3f Avg:%6.3f Global:%6.3f kb/s:%.2f\n", SUM3( h->stat.f_psnr_mean_y ) / duration, SUM3( h->stat.f_psnr_mean_u ) / duration, SUM3( h->stat.f_psnr_mean_v ) / duration, SUM3( h->stat.f_psnr_average ) / duration, calc_psnr( SUM3( h->stat.f_ssd_global ), duration * i_yuv_size ), f_bitrate ); } else x264_log( h, X264_LOG_INFO, "kb/s:%.2f\n", f_bitrate ); } /* rc */ x264_ratecontrol_delete( h ); /* param */ x264_param_cleanup( &h->param ); x264_cqm_delete( h ); x264_free( h->nal_buffer ); x264_free( h->reconfig_h ); x264_analyse_free_costs( h ); x264_free( h->cost_table ); if( h->i_thread_frames > 1 ) h = h->thread[h->i_thread_phase]; /* frames */ x264_frame_delete_list( h->frames.unused[0] ); x264_frame_delete_list( h->frames.unused[1] ); x264_frame_delete_list( h->frames.current ); x264_frame_delete_list( h->frames.blank_unused ); h = h->thread[0]; for( int i = 0; i < h->i_thread_frames; i++ ) if( h->thread[i]->b_thread_active ) for( int j = 0; j < h->thread[i]->i_ref[0]; j++ ) if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate ) x264_frame_delete( h->thread[i]->fref[0][j] ); if( h->param.i_lookahead_threads > 1 ) for( int i = 0; i < h->param.i_lookahead_threads; i++ ) x264_free( h->lookahead_thread[i] ); for( int i = h->param.i_threads - 1; i >= 0; i-- ) { x264_frame_t **frame; if( !h->param.b_sliced_threads || i == 0 ) { for( frame = h->thread[i]->frames.reference; *frame; frame++ ) { assert( (*frame)->i_reference_count > 0 ); (*frame)->i_reference_count--; if( (*frame)->i_reference_count == 0 ) x264_frame_delete( *frame ); } frame = &h->thread[i]->fdec; 
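/* The reconstructed frame (fdec) is reference-counted like the entries of the
 * reference list above: drop this thread's reference and free it when the
 * count reaches zero. */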
if( *frame ) { assert( (*frame)->i_reference_count > 0 ); (*frame)->i_reference_count--; if( (*frame)->i_reference_count == 0 ) x264_frame_delete( *frame ); } x264_macroblock_cache_free( h->thread[i] ); } x264_macroblock_thread_free( h->thread[i], 0 ); x264_free( h->thread[i]->out.p_bitstream ); x264_free( h->thread[i]->out.nal ); x264_pthread_mutex_destroy( &h->thread[i]->mutex ); x264_pthread_cond_destroy( &h->thread[i]->cv ); x264_free( h->thread[i] ); } #if HAVE_OPENCL x264_opencl_close_library( ocl ); #endif } int x264_encoder_delayed_frames( x264_t *h ) { int delayed_frames = 0; if( h->i_thread_frames > 1 ) { for( int i = 0; i < h->i_thread_frames; i++ ) delayed_frames += h->thread[i]->b_thread_active; h = h->thread[h->i_thread_phase]; } for( int i = 0; h->frames.current[i]; i++ ) delayed_frames++; x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); delayed_frames += h->lookahead->ifbuf.i_size + h->lookahead->next.i_size + h->lookahead->ofbuf.i_size; x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); return delayed_frames; } int x264_encoder_maximum_delayed_frames( x264_t *h ) { return h->frames.i_delay; } x264-master/encoder/lookahead.c000066400000000000000000000231251502133446700165740ustar00rootroot00000000000000/***************************************************************************** * lookahead.c: high-level lookahead functions ***************************************************************************** * Copyright (C) 2010-2025 Avail Media and x264 project * * Authors: Michael Kazmier * Alex Giladi * Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ /* LOOKAHEAD (threaded and non-threaded mode) * * Lookahead types: * [1] Slice type / scene cut; * * In non-threaded mode, we run the existing slicetype decision code as it was. * In threaded mode, we run in a separate thread, that lives between the calls * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for * the number of frames specified in rc_lookahead. Recommended setting is * # of bframes + # of threads. 
*/ #include "common/common.h" #include "analyse.h" static void lookahead_shift( x264_sync_frame_list_t *dst, x264_sync_frame_list_t *src, int count ) { int i = count; while( i-- ) { assert( dst->i_size < dst->i_max_size ); assert( src->i_size ); dst->list[ dst->i_size++ ] = x264_frame_shift( src->list ); src->i_size--; } if( count ) { x264_pthread_cond_broadcast( &dst->cv_fill ); x264_pthread_cond_broadcast( &src->cv_empty ); } } static void lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb ) { if( h->lookahead->last_nonb ) x264_frame_push_unused( h, h->lookahead->last_nonb ); h->lookahead->last_nonb = new_nonb; new_nonb->i_reference_count++; } #if HAVE_THREAD static void lookahead_slicetype_decide( x264_t *h ) { x264_slicetype_decide( h ); lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) x264_slicetype_analyse( h, shift_frames ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } REALIGN_STACK static void *lookahead_thread( x264_t *h ) { while( 1 ) { x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); if( h->lookahead->b_exit_thread ) { x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); break; } x264_pthread_mutex_lock( &h->lookahead->next.mutex ); int shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size ); lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length + h->param.b_vfr_input ) { while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread ) x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex ); x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); } else { x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); lookahead_slicetype_decide( h ); } } /* end of input frames */ x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size ); x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); while( h->lookahead->next.i_size ) lookahead_slicetype_decide( h ); x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); h->lookahead->b_thread_active = 0; x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_fill ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); return NULL; } #endif int x264_lookahead_init( x264_t *h, int i_slicetype_length ) { x264_lookahead_t *look; CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) ); for( int i = 0; i < h->param.i_threads; i++ ) h->thread[i]->lookahead = look; look->i_last_keyframe = - h->param.i_keyint_max; look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead)) && !h->param.rc.b_stat_read; 
look->i_slicetype_length = i_slicetype_length; /* init frame lists */ if( x264_sync_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) || x264_sync_frame_list_init( &look->next, h->frames.i_delay+3 ) || x264_sync_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) ) goto fail; if( !h->param.i_sync_lookahead ) return 0; x264_t *look_h = h->thread[h->param.i_threads]; *look_h = *h; if( x264_macroblock_cache_allocate( look_h ) ) goto fail; if( x264_macroblock_thread_allocate( look_h, 1 ) < 0 ) goto fail; if( x264_pthread_create( &look->thread_handle, NULL, (void*)lookahead_thread, look_h ) ) goto fail; look->b_thread_active = 1; return 0; fail: x264_free( look ); return -1; } void x264_lookahead_delete( x264_t *h ) { if( h->param.i_sync_lookahead ) { x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); h->lookahead->b_exit_thread = 1; x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill ); x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); x264_pthread_join( h->lookahead->thread_handle, NULL ); x264_macroblock_cache_free( h->thread[h->param.i_threads] ); x264_macroblock_thread_free( h->thread[h->param.i_threads], 1 ); x264_free( h->thread[h->param.i_threads] ); } x264_sync_frame_list_delete( &h->lookahead->ifbuf ); x264_sync_frame_list_delete( &h->lookahead->next ); if( h->lookahead->last_nonb ) x264_frame_push_unused( h, h->lookahead->last_nonb ); x264_sync_frame_list_delete( &h->lookahead->ofbuf ); x264_free( h->lookahead ); } void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame ) { if( h->param.i_sync_lookahead ) x264_sync_frame_list_push( &h->lookahead->ifbuf, frame ); else x264_sync_frame_list_push( &h->lookahead->next, frame ); } int x264_lookahead_is_empty( x264_t *h ) { x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); x264_pthread_mutex_lock( &h->lookahead->next.mutex ); int b_empty = !h->lookahead->next.i_size && !h->lookahead->ofbuf.i_size; x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); return b_empty; } static void lookahead_encoder_shift( x264_t *h ) { if( !h->lookahead->ofbuf.i_size ) return; int i_frames = h->lookahead->ofbuf.list[0]->i_bframes + 1; while( i_frames-- ) { x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) ); h->lookahead->ofbuf.i_size--; } x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty ); } void x264_lookahead_get_frames( x264_t *h ) { if( h->param.i_sync_lookahead ) { /* We have a lookahead thread, so get frames from there */ x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active ) x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex ); lookahead_encoder_shift( h ); x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } else { /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */ if( h->frames.current[0] || !h->lookahead->next.i_size ) return; x264_slicetype_decide( h ); lookahead_update_last_nonb( h, h->lookahead->next.list[0] ); int shift_frames = h->lookahead->next.list[0]->i_bframes + 1; lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, shift_frames ); /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. 
*/ if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) x264_slicetype_analyse( h, shift_frames ); lookahead_encoder_shift( h ); } } x264-master/encoder/macroblock.c000066400000000000000000001620161502133446700167640ustar00rootroot00000000000000/***************************************************************************** * macroblock.c: macroblock encoding ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "macroblock.h" /* These chroma DC functions don't have assembly versions and are only used here. */ #define ZIG(i,y,x) level[i] = dct[x*2+y]; static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] ) { ZIG(0,0,0) ZIG(1,0,1) ZIG(2,1,0) ZIG(3,1,1) } #undef ZIG static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] ) { level[0] = dct[0]; level[1] = dct[2]; level[2] = dct[1]; level[3] = dct[4]; level[4] = dct[6]; level[5] = dct[3]; level[6] = dct[5]; level[7] = dct[7]; } #define IDCT_DEQUANT_2X2_START \ int d0 = dct[0] + dct[1]; \ int d1 = dct[2] + dct[3]; \ int d2 = dct[0] - dct[1]; \ int d3 = dct[2] - dct[3]; \ int dmf = dequant_mf[i_qp%6][0] << i_qp/6; static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp ) { IDCT_DEQUANT_2X2_START dct4x4[0][0] = (d0 + d1) * dmf >> 5; dct4x4[1][0] = (d0 - d1) * dmf >> 5; dct4x4[2][0] = (d2 + d3) * dmf >> 5; dct4x4[3][0] = (d2 - d3) * dmf >> 5; } static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp ) { IDCT_DEQUANT_2X2_START dct[0] = (d0 + d1) * dmf >> 5; dct[1] = (d0 - d1) * dmf >> 5; dct[2] = (d2 + d3) * dmf >> 5; dct[3] = (d2 - d3) * dmf >> 5; } #undef IDCT_2X2_DEQUANT_START static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] ) { int d0 = dct4x4[0][0] + dct4x4[1][0]; int d1 = dct4x4[2][0] + dct4x4[3][0]; int d2 = dct4x4[0][0] - dct4x4[1][0]; int d3 = dct4x4[2][0] - dct4x4[3][0]; d[0] = d0 + d1; d[2] = d2 + d3; d[1] = d0 - d1; d[3] = d2 - d3; dct4x4[0][0] = 0; dct4x4[1][0] = 0; dct4x4[2][0] = 0; dct4x4[3][0] = 0; } static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count ) { if( WORD_SIZE == 8 ) { for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) ) if( M64( &v[i] ) ) return 1; } else { for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) ) if( M32( &v[i] ) ) return 1; } return 0; } /* All encoding functions must output the correct CBP and NNZ values. 
* The entropy coding functions will check CBP first, then NNZ, before * actually reading the DCT coefficients. NNZ still must be correct even * if CBP is zero because of the use of NNZ values for context selection. * "NNZ" need only be 0 or 1 rather than the exact coefficient count because * that is only needed in CAVLC, and will be calculated by CAVLC's residual * coding and stored as necessary. */ /* This means that decimation can be done merely by adjusting the CBP and NNZ * rather than memsetting the coefficients. */ static void mb_encode_i16x16( x264_t *h, int p, int i_qp ) { pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; int i_quant_cat = p ? CQM_4IC : CQM_4IY; int i_mode = h->mb.i_intra16x16_pred_mode; if( h->mb.b_lossless ) x264_predict_lossless_16x16( h, p, i_mode ); else h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] ); if( h->mb.b_lossless ) { for( int i = 0; i < 16; i++ ) { int oe = block_idx_xy_fenc[i]; int od = block_idx_xy_fdec[i]; nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] ); h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz; block_cbp |= nz; } h->mb.i_cbp_luma |= block_cbp * 0xf; h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 ); h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 ); return; } CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); if( h->mb.b_noise_reduction ) for( int idx = 0; idx < 16; idx++ ) h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); for( int idx = 0; idx < 16; idx++ ) { dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0]; dct4x4[idx][0] = 0; } if( h->mb.b_trellis ) { for( int idx = 0; idx < 16; idx++ ) if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) ) { block_cbp = 0xf; h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; } } else { for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); if( nz ) { block_cbp = 0xf; FOREACH_BIT( idx, i8x8*4, nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] ); h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp ); if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] ); h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1; } } } } /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */ /* More useful with CAVLC, but still useful with CABAC. 
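 * (a total decimate score below 6, checked just below, is taken to mean the
 * residual is cheap noise, so dropping it entirely is assumed to cost less
 * than signalling it)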
*/ if( decimate_score < 6 ) { CLEAR_16x16_NNZ( p ); block_cbp = 0; } else h->mb.i_cbp_luma |= block_cbp; h->dctf.dct4x4dc( dct_dc4x4 ); if( h->mb.b_trellis ) nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p ); else nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 ); h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz; if( nz ) { h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 ); /* output samples to fdec */ h->dctf.idct4x4dc( dct_dc4x4 ); h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp ); /* XXX not inversed */ if( block_cbp ) for( int i = 0; i < 16; i++ ) dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]]; } /* put pixels to fdec */ if( block_cbp ) h->dctf.add16x16_idct( p_dst, dct4x4 ); else if( nz ) h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 ); } /* Round down coefficients losslessly in DC-only chroma blocks. * Unlike luma blocks, this can't be done with a lookup table or * other shortcut technique because of the interdependencies * between the coefficients due to the chroma DC transform. */ static ALWAYS_INLINE int mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 ) { int dmf = dequant_mf[i_qp%6][0] << i_qp/6; /* If the QP is too high, there's no benefit to rounding optimization. */ if( dmf > 32*64 ) return 1; if( chroma422 ) return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf ); else return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf ); } static ALWAYS_INLINE void mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 ) { int nz, nz_dc; int b_decimate = b_inter && h->mb.b_dct_decimate; int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter]; ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] ); h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; if( chroma422 ) { M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; } /* Early termination: check variance of chroma residual before encoding. * Don't bother trying early termination at low QPs. * Values are experimentally derived. */ if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction ) { int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6; ALIGNED_ARRAY_8( int, ssd,[2] ); int chromapix = chroma422 ? 
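/* The 4:2:2 threshold is roughly twice the 4:2:0 one, presumably because the
 * macroblock then covers twice as many chroma samples. */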
PIXEL_8x16 : PIXEL_8x8; if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 ) { h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; for( int ch = 0; ch < 2; ch++ ) { if( ssd[ch] > thresh ) { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; if( chroma422 ) /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */ h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst ); else h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst ); if( h->mb.b_trellis ) nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch ); else { nz_dc = 0; for( int i = 0; i <= chroma422; i++ ) nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1, h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 ); } if( nz_dc ) { if( !mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) ) continue; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1; if( chroma422 ) { zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 ); } else { zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp ); } for( int i = 0; i <= chroma422; i++ ) h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] ); h->mb.i_cbp_chroma = 1; } } } return; } } for( int ch = 0; ch < 2; ch++ ) { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; int i_decimate_score = b_decimate ? 0 : 7; int nz_ac = 0; ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 }; for( int i = 0; i < (chroma422?8:4); i++ ) { int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE; int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE; nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] ); h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; h->mb.i_cbp_chroma |= nz; } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 ); continue; } for( int i = 0; i <= chroma422; i++ ) h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); if( h->mb.b_noise_reduction ) for( int i = 0; i < (chroma422?8:4); i++ ) h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 ); if( chroma422 ) h->dctf.dct2x4dc( dct_dc, dct4x4 ); else dct2x2dc( dct_dc, dct4x4 ); /* calculate dct coeffs */ for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ ) { if( h->mb.b_trellis ) { for( int i4x4 = 0; i4x4 < 4; i4x4++ ) { if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) ) { int idx = 16+ch*16+i8x8*8+i4x4; h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp ); if( i_decimate_score < 7 ) i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; nz_ac = 1; } } } else { nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); nz_ac |= nz; FOREACH_BIT( i4x4, 0, nz ) { int idx = 16+ch*16+i8x8*8+i4x4; h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] ); h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], 
dequant_mf, i_qp ); if( i_decimate_score < 7 ) i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] ); h->mb.cache.non_zero_count[x264_scan8[idx]] = 1; } } } if( h->mb.b_trellis ) nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch ); else { nz_dc = 0; for( int i = 0; i <= chroma422; i++ ) nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1, h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 ); } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; if( i_decimate_score < 7 || !nz_ac ) { /* Decimate the block */ M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0; if( chroma422 ) { M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0; } if( !nz_dc ) /* Whole block is empty */ continue; if( !mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) ) { h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0; continue; } /* DC-only */ if( chroma422 ) { zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 ); } else { zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp ); } for( int i = 0; i <= chroma422; i++ ) h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] ); } else { h->mb.i_cbp_chroma = 1; if( nz_dc ) { if( chroma422 ) { zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 ); } else { zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp ); } } for( int i = 0; i <= chroma422; i++ ) h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] ); } } /* 0 = none, 1 = DC only, 2 = DC+AC */ h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma); } void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ) { if( CHROMA_FORMAT == CHROMA_420 ) mb_encode_chroma_internal( h, b_inter, i_qp, 0 ); else mb_encode_chroma_internal( h, b_inter, i_qp, 1 ); } static void macroblock_encode_skip( x264_t *h ) { M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0; if( CHROMA_FORMAT >= CHROMA_422 ) { M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0; } h->mb.i_cbp_luma = 0; h->mb.i_cbp_chroma = 0; h->mb.cbp[h->mb.i_mb_xy] = 0; } /***************************************************************************** * Intra prediction for predictive lossless mode. 
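 * In lossless mode the reconstruction is bit-exact with the source, so the
 * V/H modes below can predict from the fenc plane of the neighbouring blocks
 * instead of from fdec.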
*****************************************************************************/ void x264_predict_lossless_chroma( x264_t *h, int i_mode ) { int height = 16 >> CHROMA_V_SHIFT; if( i_mode == I_PRED_CHROMA_V ) { h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height ); h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height ); memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*SIZEOF_PIXEL ); memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*SIZEOF_PIXEL ); } else if( i_mode == I_PRED_CHROMA_H ) { h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height ); h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height ); x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 ); x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 ); if( CHROMA_FORMAT == CHROMA_422 ) { x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 ); x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 ); } } else { h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } } void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode ) { int stride = h->fenc->i_stride[p] << MB_INTERLACED; pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride; if( i_mode == I_PRED_4x4_V ) { h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 ); memcpy( p_dst, p_dst-FDEC_STRIDE, 4*SIZEOF_PIXEL ); } else if( i_mode == I_PRED_4x4_H ) { h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 ); for( int i = 0; i < 4; i++ ) p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; } else h->predict_4x4[i_mode]( p_dst ); } void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] ) { int stride = h->fenc->i_stride[p] << MB_INTERLACED; pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride; if( i_mode == I_PRED_8x8_V ) { h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 ); memcpy( p_dst, &edge[16], 8*SIZEOF_PIXEL ); } else if( i_mode == I_PRED_8x8_H ) { h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 ); for( int i = 0; i < 8; i++ ) p_dst[i*FDEC_STRIDE] = edge[14-i]; } else h->predict_8x8[i_mode]( p_dst, edge ); } void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ) { int stride = h->fenc->i_stride[p] << MB_INTERLACED; pixel *p_dst = h->mb.pic.p_fdec[p]; if( i_mode == I_PRED_16x16_V ) { h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 ); memcpy( p_dst, p_dst-FDEC_STRIDE, 16*SIZEOF_PIXEL ); } else if( i_mode == I_PRED_16x16_H ) { h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 ); for( int i = 0; i < 16; i++ ) p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; } else h->predict_16x16[i_mode]( p_dst ); } /***************************************************************************** * x264_macroblock_encode: *****************************************************************************/ static ALWAYS_INLINE void macroblock_encode_internal( x264_t *h, int plane_count, int chroma ) { int i_qp = h->mb.i_qp; int b_decimate = h->mb.b_dct_decimate; int b_force_no_skip = 0; int nz; h->mb.i_cbp_luma = 0; for( 
int p = 0; p < plane_count; p++ ) h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0; if( h->mb.i_type == I_PCM ) { /* if PCM is chosen, we need to store reconstructed frame data */ for( int p = 0; p < plane_count; p++ ) h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 ); if( chroma ) { int height = 16 >> CHROMA_V_SHIFT; h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height ); h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height ); } return; } if( !h->mb.b_allow_skip ) { b_force_no_skip = 1; if( IS_SKIP(h->mb.i_type) ) { if( h->mb.i_type == P_SKIP ) h->mb.i_type = P_L0; else if( h->mb.i_type == B_SKIP ) h->mb.i_type = B_DIRECT; } } if( h->mb.i_type == P_SKIP ) { /* don't do pskip motion compensation if it was already done in macroblock_analyse */ if( !h->mb.b_skip_mc ) { int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0], h->mb.mv_min[0], h->mb.mv_max[0] ); int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1], h->mb.mv_min[1], h->mb.mv_max[1] ); for( int p = 0; p < plane_count; p++ ) h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE, &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p], mvx, mvy, 16, 16, &h->sh.weight[0][p] ); if( chroma ) { int v_shift = CHROMA_V_SHIFT; int height = 16 >> v_shift; /* Special case for mv0, which is (of course) very common in P-skip mode. */ if( mvx | mvy ) h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], mvx, 2*mvy>>v_shift, 8, height ); else h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], height ); if( h->sh.weight[0][1].weightfn ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &h->sh.weight[0][1], height ); if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &h->sh.weight[0][2], height ); } } macroblock_encode_skip( h ); return; } if( h->mb.i_type == B_SKIP ) { /* don't do bskip motion compensation if it was already done in macroblock_analyse */ if( !h->mb.b_skip_mc ) x264_mb_mc( h ); macroblock_encode_skip( h ); return; } if( h->mb.i_type == I_16x16 ) { h->mb.b_transform_8x8 = 0; for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) mb_encode_i16x16( h, p, i_qp ); } else if( h->mb.i_type == I_8x8 ) { h->mb.b_transform_8x8 = 1; /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 ); M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0]; M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1]; M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2]; M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3]; h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp; /* In RD mode, restore the now-overwritten DCT data. */ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) ); } for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { for( int i = (p == 0 && h->mb.i_skip_intra) ? 
3 : 0; i < 4; i++ ) { int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]]; x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 ); } } } else if( h->mb.i_type == I_4x4 ) { h->mb.b_transform_8x8 = 0; /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 ); M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0]; M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1]; M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2]; M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3]; h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp; /* In RD mode, restore the now-overwritten DCT data. */ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) ); } for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0; i < 16; i++ ) { pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]]; int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]; if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) /* emulate missing topright samples */ MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] ); x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 ); } } } else /* Inter MB */ { int i_decimate_mb = 0; /* Don't repeat motion compensation if it was already done in non-RD transform analysis */ if( !h->mb.b_skip_mc ) x264_mb_mc( h ); if( h->mb.b_lossless ) { if( h->mb.b_transform_8x8 ) for( int p = 0; p < plane_count; p++ ) for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { int x = i8x8&1; int y = i8x8>>1; nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE, h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE ); STORE_8x8_NNZ( p, i8x8, nz ); h->mb.i_cbp_luma |= nz << i8x8; } else for( int p = 0; p < plane_count; p++ ) for( int i4x4 = 0; i4x4 < 16; i4x4++ ) { nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4], h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4], h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz; h->mb.i_cbp_luma |= nz << (i4x4>>2); } } else if( h->mb.b_transform_8x8 ) { ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] ); b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_8PC : CQM_8PY; CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4; int plane_cbp = 0; for( int idx = 0; idx < 4; idx++ ) { nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx ); if( nz ) { h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] ); if( b_decimate ) { int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] ); i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 >= 4 ) plane_cbp |= 1<<idx; } else plane_cbp |= 1<<idx; } } if( i_decimate_mb >= 6 || !b_decimate ) { h->mb.i_cbp_luma |= plane_cbp; FOREACH_BIT( idx, 0, plane_cbp ) { h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[quant_cat], i_qp ); h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] ); STORE_8x8_NNZ( p, idx, 1 ); } } } } else { ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; CLEAR_16x16_NNZ( p ); h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); if( h->mb.b_noise_reduction ) { h->nr_count[0+!!p*2] += 16; for( int idx = 0; idx < 16; idx++ ) h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); } int plane_cbp = 0; for( int i8x8 = 0; i8x8 < 4; i8x8++ ) { int i_decimate_8x8 = b_decimate ? 0 : 6; int nnz8x8 = 0; if( h->mb.b_trellis ) { for( int i4x4 = 0; i4x4 < 4; i4x4++ ) { int idx = i8x8*4+i4x4; if( x264_quant_4x4_trellis( h, dct4x4[idx], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) ) { h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp ); if( i_decimate_8x8 < 6 ) i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; nnz8x8 = 1; } } } else { nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); if( nz ) { FOREACH_BIT( idx, i8x8*4, nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp ); if( i_decimate_8x8 < 6 ) i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; } } } if( nnz8x8 ) { i_decimate_mb += i_decimate_8x8; if( i_decimate_8x8 < 4 ) STORE_8x8_NNZ( p, i8x8, 0 ); else plane_cbp |= 1<<i8x8; } } if( i_decimate_mb < 6 && b_decimate ) { plane_cbp = 0; CLEAR_16x16_NNZ( p ); } else { h->mb.i_cbp_luma |= plane_cbp; FOREACH_BIT( i8x8, 0, plane_cbp ) { h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); } } } } } /* encode chroma */ if( chroma ) { if( IS_INTRA( h->mb.i_type ) ) { int i_mode = h->mb.i_chroma_pred_mode; if( h->mb.b_lossless ) x264_predict_lossless_chroma( h, i_mode ); else { h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } } /* encode the 8x8 blocks */ x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); } else h->mb.i_cbp_chroma = 0; /* store cbp */ int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma; if( h->param.b_cabac ) cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC ]] << 8 | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9 | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10; h->mb.cbp[h->mb.i_mb_xy] = cbp; /* Check for P_SKIP * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account * (if multiple mv give same result)*/ if( !b_force_no_skip
) { if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv ) && h->mb.cache.ref[0][x264_scan8[0]] == 0 ) { h->mb.i_type = P_SKIP; } /* Check for B_SKIP */ if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) ) { h->mb.i_type = B_SKIP; } } } void x264_macroblock_encode( x264_t *h ) { if( CHROMA444 ) macroblock_encode_internal( h, 3, 0 ); else if( CHROMA_FORMAT ) macroblock_encode_internal( h, 1, 1 ); else macroblock_encode_internal( h, 1, 0 ); } /***************************************************************************** * x264_macroblock_probe_skip: * Check if the current MB could be encoded as a [PB]_SKIP *****************************************************************************/ static ALWAYS_INLINE int macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); ALIGNED_ARRAY_64( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); int i_qp = h->mb.i_qp; for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; if( !b_bidir ) { /* Get the MV */ mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] ); mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] ); /* Motion compensation */ h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE, &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p], mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] ); } for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ ) { int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8; int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8; h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset, h->mb.pic.p_fdec[p] + fdec_offset ); if( h->mb.b_noise_reduction ) for( int i4x4 = 0; i4x4 < 4; i4x4++ ) h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); FOREACH_BIT( idx, 0, nz ) { h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); i_decimate_mb += h->quantf.decimate_score16( dctscan ); if( i_decimate_mb >= 6 ) return 0; } } } if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { i_qp = h->mb.i_chroma_qp; int chroma422 = chroma == CHROMA_422; int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6; int ssd; ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] ); if( !b_bidir ) { /* Special case for mv0, which is (of course) very common in P-skip mode. */ if( M32( mvp ) ) h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], mvp[0], mvp[1] * (1<<chroma422), 8, chroma422?16:8 ); else h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], chroma422?16:8 ); } for( int ch = 0; ch < 2; ch++ ) { pixel *p_src = h->mb.pic.p_fenc[1+ch]; pixel *p_dst = h->mb.pic.p_fdec[1+ch]; if( !b_bidir && h->sh.weight[0][1+ch].weightfn ) h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, &h->sh.weight[0][1+ch], chroma422?16:8 ); /* there is almost never a termination during chroma, but we can't avoid the check entirely */ /* so instead we check SSD and skip the actual check if the score is low enough.
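 * (an SSD below the lambda-scaled threshold is assumed to be too small to
 * produce coefficients that would disqualify the skip)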
*/ ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); if( ssd < thresh ) continue; /* The vast majority of chroma checks will terminate during the DC check or the higher * threshold check, so we can save time by doing a DC-only DCT. */ if( h->mb.b_noise_reduction ) { for( int i = 0; i <= chroma422; i++ ) h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ ) { h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); dct_dc[i4x4] = dct4x4[i4x4][0]; dct4x4[i4x4][0] = 0; } } else { if( chroma422 ) h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst ); else h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst ); } for( int i = 0; i <= chroma422; i++ ) if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1, h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) ) return 0; /* If there wasn't a termination in DC, we can check against a much higher threshold. */ if( ssd < thresh*4 ) continue; if( !h->mb.b_noise_reduction ) for( int i = 0; i <= chroma422; i++ ) { h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); dct4x4[i*4+0][0] = 0; dct4x4[i*4+1][0] = 0; dct4x4[i*4+2][0] = 0; dct4x4[i*4+3][0] = 0; } /* calculate dct coeffs */ for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ ) { int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); FOREACH_BIT( idx, i8x8*4, nz ) { h->zigzagf.scan_4x4( dctscan, dct4x4[idx] ); i_decimate_mb += h->quantf.decimate_score15( dctscan ); if( i_decimate_mb >= 7 ) return 0; } } } } h->mb.b_skip_mc = 1; return 1; } int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) { if( CHROMA_FORMAT == CHROMA_420 ) return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 ); else if( CHROMA_FORMAT == CHROMA_422 ) return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 ); else if( CHROMA_FORMAT == CHROMA_444 ) return macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 ); else return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_400 ); } /**************************************************************************** * DCT-domain noise reduction / adaptive deadzone * from libavcodec ****************************************************************************/ void x264_noise_reduction_update( x264_t *h ) { h->nr_offset = h->nr_offset_denoise; h->nr_residual_sum = h->nr_residual_sum_buf[0]; h->nr_count = h->nr_count_buf[0]; for( int cat = 0; cat < 3 + CHROMA444; cat++ ) { int dct8x8 = cat&1; int size = dct8x8 ? 64 : 16; const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) ) { for( int i = 0; i < size; i++ ) h->nr_residual_sum[cat][i] >>= 1; h->nr_count[cat] >>= 1; } for( int i = 0; i < size; i++ ) h->nr_offset[cat][i] = ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat] + h->nr_residual_sum[cat][i]/2) / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1); /* Don't denoise DC coefficients */ h->nr_offset[cat][0] = 0; } } /***************************************************************************** * RD only; 4 calls to this do not make up for one macroblock_encode. * doesn't transform chroma dc. 
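 * (called from the RD partition refinement, which re-encodes one 8x8 block at
 * a time)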
*****************************************************************************/ static ALWAYS_INLINE void macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma ) { int b_decimate = h->mb.b_dct_decimate; int i_qp = h->mb.i_qp; int x = i8&1; int y = i8>>1; int nz; int chroma422 = chroma == CHROMA_422; h->mb.i_cbp_chroma = 0; h->mb.i_cbp_luma &= ~(1 << i8); if( !h->mb.b_skip_mc ) x264_mb_mc_8x8( h, i8 ); if( h->mb.b_lossless ) { for( int p = 0; p < plane_count; p++ ) { pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; int nnz8x8 = 0; if( h->mb.b_transform_8x8 ) { nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec ); STORE_8x8_NNZ( p, i8, nnz8x8 ); } else { for( int i4 = i8*4; i4 < i8*4+4; i4++ ) { nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4], h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4], h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] ); h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz; nnz8x8 |= nz; } } h->mb.i_cbp_luma |= nnz8x8 << i8; } if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { for( int ch = 0; ch < 2; ch++ ) { dctcoef dc; pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ ) { int offset = chroma422 ? 8*y + 2*i4x4 + x : i8; nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc ); h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz; } } h->mb.i_cbp_chroma = 0x02; } } else { if( h->mb.b_transform_8x8 ) { for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_8PC : CQM_8PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 ); if( nnz8x8 ) { h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 ); if( b_decimate && !h->mb.b_trellis ) nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] ); if( nnz8x8 ) { h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp ); h->dctf.add8x8_idct8( p_fdec, dct8x8 ); STORE_8x8_NNZ( p, i8, 1 ); h->mb.i_cbp_luma |= 1 << i8; } else STORE_8x8_NNZ( p, i8, 0 ); } else STORE_8x8_NNZ( p, i8, 0 ); } } else { for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; int i_decimate_8x8 = b_decimate ? 
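/* starting the score at 4 when decimation is disabled guarantees the block is
 * never dropped by the "< 4" check below */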
0 : 4; ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] ); int nnz8x8 = 0; h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); STORE_8x8_NNZ( p, i8, 0 ); if( h->mb.b_noise_reduction ) for( int idx = 0; idx < 4; idx++ ) h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); if( h->mb.b_trellis ) { for( int i4x4 = 0; i4x4 < 4; i4x4++ ) { if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) ) { h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] ); h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp ); if( i_decimate_8x8 < 4 ) i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] ); h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1; nnz8x8 = 1; } } } else { nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); if( nz ) { FOREACH_BIT( i4x4, 0, nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] ); h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp ); if( i_decimate_8x8 < 4 ) i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] ); h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1; } } } if( nnz8x8 ) { /* decimate this 8x8 block */ if( i_decimate_8x8 < 4 ) STORE_8x8_NNZ( p, i8, 0 ); else { h->dctf.add8x8_idct( p_fdec, dct4x4 ); h->mb.i_cbp_luma |= 1 << i8; } } } } if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] ); pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ ) { h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE ); if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); dct4x4[i4x4][0] = 0; if( h->mb.b_trellis ) nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 ); else nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8; h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz; if( nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] ); h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp ); h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] ); } } } h->mb.i_cbp_chroma = 0x02; } } } void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { if( CHROMA_FORMAT == CHROMA_420 ) macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 ); else if( CHROMA_FORMAT == CHROMA_422 ) macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 ); else if( CHROMA_FORMAT == CHROMA_444 ) macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 ); else macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_400 ); } /***************************************************************************** * RD only, luma only (for 4:2:0) *****************************************************************************/ static ALWAYS_INLINE void macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count ) { int i_qp = h->mb.i_qp; for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_4PC : CQM_4PY; pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]]; pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]]; int nz; /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */ if( h->mb.b_lossless ) { nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; } else { ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; if( nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp ); h->dctf.add4x4_idct( p_fdec, dct4x4 ); } } } } void x264_macroblock_encode_p4x4( x264_t *h, int i8 ) { if( CHROMA444 ) macroblock_encode_p4x4_internal( h, i8, 3 ); else macroblock_encode_p4x4_internal( h, i8, 1 ); } x264-master/encoder/macroblock.h000066400000000000000000000222241502133446700167650ustar00rootroot00000000000000/***************************************************************************** * macroblock.h: macroblock encoding ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_ENCODER_MACROBLOCK_H #define X264_ENCODER_MACROBLOCK_H #include "common/macroblock.h" #define x264_rdo_init x264_template(rdo_init) void x264_rdo_init( void ); #define x264_macroblock_probe_skip x264_template(macroblock_probe_skip) int x264_macroblock_probe_skip( x264_t *h, int b_bidir ); #define x264_macroblock_probe_pskip( h )\ x264_macroblock_probe_skip( h, 0 ) #define x264_macroblock_probe_bskip( h )\ x264_macroblock_probe_skip( h, 1 ) #define x264_predict_lossless_4x4 x264_template(predict_lossless_4x4) void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode ); #define x264_predict_lossless_8x8 x264_template(predict_lossless_8x8) void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] ); #define x264_predict_lossless_16x16 x264_template(predict_lossless_16x16) void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ); #define x264_predict_lossless_chroma x264_template(predict_lossless_chroma) void x264_predict_lossless_chroma( x264_t *h, int i_mode ); #define x264_macroblock_encode x264_template(macroblock_encode) void x264_macroblock_encode ( x264_t *h ); #define x264_macroblock_write_cabac x264_template(macroblock_write_cabac) void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb ); #define x264_macroblock_write_cavlc x264_template(macroblock_write_cavlc) void x264_macroblock_write_cavlc ( x264_t *h ); #define x264_macroblock_encode_p8x8 x264_template(macroblock_encode_p8x8) void x264_macroblock_encode_p8x8( x264_t *h, int i8 ); #define x264_macroblock_encode_p4x4 x264_template(macroblock_encode_p4x4) void x264_macroblock_encode_p4x4( x264_t *h, int i4 ); #define x264_mb_encode_chroma x264_template(mb_encode_chroma) void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ); #define x264_cabac_mb_skip x264_template(cabac_mb_skip) void x264_cabac_mb_skip( x264_t *h, int b_skip ); #define x264_cabac_block_residual_c x264_template(cabac_block_residual_c) void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); #define x264_cabac_block_residual_8x8_rd_c x264_template(cabac_block_residual_8x8_rd_c) void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); #define x264_cabac_block_residual_rd_c x264_template(cabac_block_residual_rd_c) void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); #define x264_quant_luma_dc_trellis x264_template(quant_luma_dc_trellis) int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx ); #define x264_quant_chroma_dc_trellis x264_template(quant_chroma_dc_trellis) int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx ); #define x264_quant_4x4_trellis x264_template(quant_4x4_trellis) int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ); #define x264_quant_8x8_trellis x264_template(quant_8x8_trellis) int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ); #define x264_noise_reduction_update x264_template(noise_reduction_update) void x264_noise_reduction_update( x264_t *h ); static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int 
idx ) { int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY); if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( dct, h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); if( h->mb.b_trellis ) return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*16 ); else return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); } static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int ctx_block_cat, int b_intra, int p, int idx ) { int i_quant_cat = b_intra ? (p?CQM_8IC:CQM_8IY) : (p?CQM_8PC:CQM_8PY); if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( dct, h->nr_residual_sum[1+!!p*2], h->nr_offset[1+!!p*2], 64 ); if( h->mb.b_trellis ) return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, !!p, idx+p*4 ); else return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] ); } #define STORE_8x8_NNZ( p, idx, nz )\ do\ {\ M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+0] ) = (nz) * 0x0101;\ M16( &h->mb.cache.non_zero_count[x264_scan8[p*16+idx*4]+8] ) = (nz) * 0x0101;\ } while( 0 ) #define CLEAR_16x16_NNZ( p ) \ do\ {\ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 0*8] ) = 0;\ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 1*8] ) = 0;\ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 2*8] ) = 0;\ M32( &h->mb.cache.non_zero_count[x264_scan8[16*p] + 3*8] ) = 0;\ } while( 0 ) /* A special for loop that iterates branchlessly over each set * bit in a 4-bit input. */ #define FOREACH_BIT(idx,start,mask) for( int idx = start, msk = mask, skip; msk && (skip = x264_ctz_4bit(msk), idx += skip, msk >>= skip+1, 1); idx++ ) static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode, int b_predict ) { int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); if( b_predict ) { if( h->mb.b_lossless ) x264_predict_lossless_4x4( h, p_dst, p, idx, i_mode ); else h->predict_4x4[i_mode]( p_dst ); } if( h->mb.b_lossless ) { nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+idx], p_src, p_dst ); h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz; h->mb.i_cbp_luma |= nz<<(idx>>2); return; } h->dctf.sub4x4_dct( dct4x4, p_src, p_dst ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 1, p, idx ); h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = nz; if( nz ) { h->mb.i_cbp_luma |= 1<<(idx>>2); h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[p?CQM_4IC:CQM_4IY], i_qp ); h->dctf.add4x4_idct( p_dst, dct4x4 ); } } static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge, int b_predict ) { int x = idx&1; int y = idx>>1; int nz; pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict ) { if( !edge ) { h->predict_8x8_filter( p_dst, edge_buf, h->mb.i_neighbour8[idx], x264_pred_i4x4_neighbors[i_mode] ); edge = edge_buf; } if( h->mb.b_lossless ) x264_predict_lossless_8x8( h, p_dst, p, idx, i_mode, edge ); else h->predict_8x8[i_mode]( p_dst, edge ); } if( h->mb.b_lossless ) { nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+idx], p_src, p_dst ); STORE_8x8_NNZ( p, idx, nz ); h->mb.i_cbp_luma |= 
nz<<idx; return; } h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst ); nz = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 1, p, idx ); if( nz ) { h->mb.i_cbp_luma |= 1<<idx; h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8 ); h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[p?CQM_8IC:CQM_8IY], i_qp ); h->dctf.add8x8_idct8( p_dst, dct8x8 ); STORE_8x8_NNZ( p, idx, 1 ); } else STORE_8x8_NNZ( p, idx, 0 ); } #endif x264-master/encoder/me.c000066400000000000000000001527041502133446700152540ustar00rootroot00000000000000/***************************************************************************** * me.c: motion estimation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "macroblock.h" #include "me.h" /* presets selected from good points on the speed-vs-quality curve of several test videos * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel } * where me_* are the number of EPZS iterations run on all candidate block types, * and refine_* are run only on the winner. * the subme=8,9 values are much higher because any amount of satd search makes * up its time by reducing the number of qpel-rd iterations. */ static const uint8_t subpel_iterations[][4] = {{0,0,0,0}, {1,1,0,0}, {0,1,1,0}, {0,2,1,0}, {0,2,1,1}, {0,2,1,2}, {0,0,2,2}, {0,0,2,2}, {0,0,4,10}, {0,0,4,10}, {0,0,4,10}, {0,0,4,10}}; /* (x-1)%6 */ static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0}; /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time.
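* (The half-hexagon refinement below reads hex2[dir+0], hex2[dir+1] and hex2[dir+2] with dir kept in 0..5 by mod6m1, so entries 6 and 7 simply repeat entries 0 and 1 and those lookups never have to wrap.)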
*/ static const int8_t hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}}; static const int8_t square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}}; static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ); #define BITS_MVD( mx, my )\ (p_cost_mvx[(mx)*4] + p_cost_mvy[(my)*4]) #define COST_MV( mx, my )\ do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ } while( 0 ) #define COST_MV_HPEL( mx, my, cost )\ do\ {\ intptr_t stride2 = 16;\ pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ } while( 0 ) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ pixel *pix_base = p_fref_w + bmx + bmy*stride;\ h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\ pix_base + (m0x) + (m0y)*stride,\ pix_base + (m1x) + (m1y)*stride,\ pix_base + (m2x) + (m2y)*stride,\ stride, costs );\ (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\ (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\ (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\ } #define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\ {\ pixel *pix_base = p_fref_w + bmx + bmy*stride;\ h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ pix_base + (m0x) + (m0y)*stride,\ pix_base + (m1x) + (m1y)*stride,\ pix_base + (m2x) + (m2y)*stride,\ pix_base + (m3x) + (m3y)*stride,\ stride, costs );\ (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\ (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\ (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\ (costs)[3] += BITS_MVD( bmx+(m3x), bmy+(m3y) );\ } #define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\ {\ pixel *pix_base = p_fref_w + omx + omy*stride;\ h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ pix_base + (m0x) + (m0y)*stride,\ pix_base + (m1x) + (m1y)*stride,\ pix_base + (m2x) + (m2y)*stride,\ pix_base + (m3x) + (m3y)*stride,\ stride, costs );\ costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\ costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\ costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\ costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\ COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\ COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\ COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\ COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\ } #define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\ {\ h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\ p_fref_w + (m0x) + (m0y)*stride,\ p_fref_w + (m1x) + (m1y)*stride,\ p_fref_w + (m2x) + (m2y)*stride,\ stride, costs );\ costs[0] += p_cost_mvx[(m0x)*4]; /* no cost_mvy */\ costs[1] += p_cost_mvx[(m1x)*4];\ costs[2] += p_cost_mvx[(m2x)*4];\ COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\ COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\ COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\ } /* 1 */ /* 101 */ /* 1 */ #define DIA1_ITER( mx, my )\ {\ omx = mx; omy = my;\ COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\ } #define CROSS( start, x_max, y_max )\ {\ int i = start;\ if( (x_max) <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\ for( ; i < (x_max)-2; i+=4 )\ COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\ for( ; i < (x_max); i+=2 )\ {\ if( omx+i <= mv_x_max )\ COST_MV( omx+i, omy );\ if( omx-i >= mv_x_min )\ COST_MV( omx-i, omy );\ }\ i = start;\ if( (y_max) <= 
X264_MIN(mv_y_max-omy, omy-mv_y_min) )\ for( ; i < (y_max)-2; i+=4 )\ COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\ for( ; i < (y_max); i+=2 )\ {\ if( omy+i <= mv_y_max )\ COST_MV( omx, omy+i );\ if( omy-i >= mv_y_min )\ COST_MV( omx, omy-i );\ }\ } #define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ #define SPEL(mv) ((mv)*4) /* ... and the reverse. */ #define SPELx2(mv) (SPEL(mv)&0xFFFCFFFC) /* for two packed MVs */ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; int bmx, bmy, bcost = COST_MAX; int bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; ALIGNED_ARRAY_32( pixel, pix,[16*16] ); ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); ALIGNED_ARRAY_16( int, costs,[16] ); int mv_x_min = h->mb.mv_limit_fpel[0][0]; int mv_y_min = h->mb.mv_limit_fpel[0][1]; int mv_x_max = h->mb.mv_limit_fpel[1][0]; int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) (((uint32_t)(mx)<<16)|((uint32_t)(my)&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; uint32_t pmv, bpred_mv = 0; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; /* Try extra predictors if provided. If subme >= 3, check subpel predictors, * otherwise round them to fullpel. */ if( h->mb.i_subpel_refine >= 3 ) { /* Calculate and check the MVP first */ int bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); int bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); pmv = pack16to32_mask( bpred_mx, bpred_my ); pmx = FPEL( bpred_mx ); pmy = FPEL( bpred_my ); COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); int pmv_cost = bpred_cost; if( i_mvc > 0 ) { /* Clip MV candidates and eliminate those equal to zero and pmv. */ int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); if( valid_mvcs > 0 ) { int i = 1, cost; /* We stuff pmv here to branchlessly pick between pmv and the various * MV candidates. [0] gets skipped in order to maintain alignment for * x264_predictor_clip. */ M32( mvc_temp[1] ) = pmv; bpred_cost <<= 4; do { int mx = mvc_temp[i+1][0]; int my = mvc_temp[i+1][1]; COST_MV_HPEL( mx, my, cost ); COPY1_IF_LT( bpred_cost, (cost << 4) + i ); } while( ++i <= valid_mvcs ); bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; bpred_my = mvc_temp[(bpred_cost&15)+1][1]; bpred_cost >>= 4; } } /* Round the best predictor back to fullpel and get the cost, since this is where * we'll be starting the fullpel motion search. */ bmx = FPEL( bpred_mx ); bmy = FPEL( bpred_my ); bpred_mv = pack16to32_mask(bpred_mx, bpred_my); if( bpred_mv&0x00030003 ) /* Only test if the tested predictor is actually subpel... */ COST_MV( bmx, bmy ); else /* Otherwise just copy the cost (we already know it) */ bcost = bpred_cost; /* Test the zero vector if it hasn't been tested yet. 
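* (pmv is the packed, clipped subpel MVP; it is zero only when that MVP is exactly (0,0), in which case the MVP cost computed above already covers the zero vector.)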
*/ if( pmv ) { if( bmx|bmy ) COST_MV( 0, 0 ); } /* If a subpel mv candidate was better than the zero vector, the previous * fullpel check won't have gotten it even if the pmv was zero. So handle * that possibility here. */ else { COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); } } else { /* Calculate and check the fullpel MVP first */ bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); pmv = pack16to32_mask( bmx, bmy ); /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will * result in a vector at or next to the predicted motion vector. Therefore, we omit * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of * the predicted motion vector. * * Disclaimer: this is a post-hoc rationalization for why this hack works. */ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); if( i_mvc > 0 ) { /* Like in subme>=3, except we also round the candidates to fullpel. */ int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); if( valid_mvcs > 0 ) { int i = 1, cost; M32( mvc_temp[1] ) = pmv; bcost <<= 4; do { int mx = mvc_temp[i+1][0]; int my = mvc_temp[i+1][1]; cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); COPY1_IF_LT( bcost, (cost << 4) + i ); } while( ++i <= valid_mvcs ); bmx = mvc_temp[(bcost&15)+1][0]; bmy = mvc_temp[(bcost&15)+1][1]; bcost >>= 4; } } /* Same as above, except the condition is simpler. */ if( pmv ) COST_MV( 0, 0 ); } switch( h->mb.i_me_method ) { case X264_ME_DIA: { /* diamond search, radius 1 */ bcost <<= 4; int i = i_me_range; do { COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); COPY1_IF_LT( bcost, (costs[1]<<4)+3 ); COPY1_IF_LT( bcost, (costs[2]<<4)+4 ); COPY1_IF_LT( bcost, (costs[3]<<4)+12 ); if( !(bcost&15) ) break; bmx -= (int32_t)((uint32_t)bcost<<28)>>30; bmy -= (int32_t)((uint32_t)bcost<<30)>>30; bcost &= ~15; } while( --i && CHECK_MVRANGE(bmx, bmy) ); bcost >>= 4; break; } case X264_ME_HEX: { me_hex2: /* hexagon search, radius 2 */ #if 0 for( int i = 0; i < i_me_range/2; i++ ) { omx = bmx; omy = bmy; COST_MV( omx-2, omy ); COST_MV( omx-1, omy+2 ); COST_MV( omx+1, omy+2 ); COST_MV( omx+2, omy ); COST_MV( omx+1, omy-2 ); COST_MV( omx-1, omy-2 ); if( bmx == omx && bmy == omy ) break; if( !CHECK_MVRANGE(bmx, bmy) ) break; } #else /* equivalent to the above, but eliminates duplicate candidates */ /* hexagon */ COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs ); COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */ bcost <<= 3; COPY1_IF_LT( bcost, (costs[0]<<3)+2 ); COPY1_IF_LT( bcost, (costs[1]<<3)+3 ); COPY1_IF_LT( bcost, (costs[2]<<3)+4 ); COPY1_IF_LT( bcost, (costs[4]<<3)+5 ); COPY1_IF_LT( bcost, (costs[5]<<3)+6 ); COPY1_IF_LT( bcost, (costs[6]<<3)+7 ); if( bcost&7 ) { int dir = (bcost&7)-2; bmx += hex2[dir+1][0]; bmy += hex2[dir+1][1]; /* half hexagon, not overlapping the previous iteration */ for( int i = (i_me_range>>1) - 1; i > 0 && CHECK_MVRANGE(bmx, bmy); i-- ) { COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1], hex2[dir+1][0], hex2[dir+1][1], hex2[dir+2][0], hex2[dir+2][1], costs ); bcost &= ~7; COPY1_IF_LT( bcost, (costs[0]<<3)+1 ); COPY1_IF_LT( bcost, (costs[1]<<3)+2 ); COPY1_IF_LT( bcost, 
(costs[2]<<3)+3 ); if( !(bcost&7) ) break; dir += (bcost&7)-2; dir = mod6m1[dir+1]; bmx += hex2[dir+1][0]; bmy += hex2[dir+1][1]; } } bcost >>= 3; #endif /* square refine */ bcost <<= 4; COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); COPY1_IF_LT( bcost, (costs[1]<<4)+2 ); COPY1_IF_LT( bcost, (costs[2]<<4)+3 ); COPY1_IF_LT( bcost, (costs[3]<<4)+4 ); COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs ); COPY1_IF_LT( bcost, (costs[0]<<4)+5 ); COPY1_IF_LT( bcost, (costs[1]<<4)+6 ); COPY1_IF_LT( bcost, (costs[2]<<4)+7 ); COPY1_IF_LT( bcost, (costs[3]<<4)+8 ); bmx += square1[bcost&15][0]; bmy += square1[bcost&15][1]; bcost >>= 4; break; } case X264_ME_UMH: { /* Uneven-cross Multi-Hexagon-grid Search * as in JM, except with different early termination */ static const uint8_t pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 }; int ucost1, ucost2; int cross_start = 1; /* refine predictors */ ucost1 = bcost; DIA1_ITER( pmx, pmy ); if( pmx | pmy ) DIA1_ITER( 0, 0 ); if( i_pixel == PIXEL_4x4 ) goto me_hex2; ucost2 = bcost; if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) ) DIA1_ITER( bmx, bmy ); if( bcost == ucost2 ) cross_start = 3; omx = bmx; omy = bmy; /* early termination */ #define SAD_THRESH(v) ( bcost < ( v >> pixel_size_shift[i_pixel] ) ) if( bcost == ucost2 && SAD_THRESH(2000) ) { COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 ); COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 ); if( bcost == ucost1 && SAD_THRESH(500) ) break; if( bcost == ucost2 ) { int range = (i_me_range>>1) | 1; CROSS( 3, range, range ); COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 ); COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 ); if( bcost == ucost2 ) break; cross_start = range + 2; } } /* adaptive search range */ if( i_mvc ) { /* range multipliers based on casual inspection of some statistics of * average distance between current predictor and final mv found by ESA. * these have not been tuned much by actual encoding. */ static const uint8_t range_mul[4][4] = { { 3, 3, 4, 4 }, { 3, 4, 4, 4 }, { 4, 4, 4, 5 }, { 4, 4, 5, 6 }, }; int mvd; int sad_ctx, mvd_ctx; int denom = 1; if( i_mvc == 1 ) { if( i_pixel == PIXEL_16x16 ) /* mvc is probably the same as mvp, so the difference isn't meaningful. * but prediction usually isn't too bad, so just use medium range */ mvd = 25; else mvd = abs( m->mvp[0] - mvc[0][0] ) + abs( m->mvp[1] - mvc[0][1] ); } else { /* calculate the degree of agreement between predictors. */ /* in 16x16, mvc includes all the neighbors used to make mvp, * so don't count mvp separately. */ denom = i_mvc - 1; mvd = 0; if( i_pixel != PIXEL_16x16 ) { mvd = abs( m->mvp[0] - mvc[0][0] ) + abs( m->mvp[1] - mvc[0][1] ); denom++; } mvd += x264_predictor_difference( mvc, i_mvc ); } sad_ctx = SAD_THRESH(1000) ? 0 : SAD_THRESH(2000) ? 1 : SAD_THRESH(4000) ? 2 : 3; mvd_ctx = mvd < 10*denom ? 0 : mvd < 20*denom ? 1 : mvd < 40*denom ? 2 : 3; i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2; } /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. * we are still centered on the same place as the DIA2. is this desirable? 
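* (Only the CROSS below and the COST_MV_X4( -2,-2, ... ) right after it use the stale centre; the hexagon grid that follows re-reads bmx/bmy into omx/omy.)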
*/ CROSS( cross_start, i_me_range, i_me_range>>1 ); COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 ); /* hexagon grid */ omx = bmx; omy = bmy; const uint16_t *p_cost_omvx = p_cost_mvx + omx*4; const uint16_t *p_cost_omvy = p_cost_mvy + omy*4; int i = 1; do { static const int8_t hex4[16][2] = { { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3}, {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1}, {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1}, {-4, 2}, { 4, 2}, {-2, 3}, { 2, 3}, }; if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min, mv_y_max-omy, omy-mv_y_min ) ) { for( int j = 0; j < 16; j++ ) { int mx = omx + hex4[j][0]*i; int my = omy + hex4[j][1]*i; if( CHECK_MVRANGE(mx, my) ) COST_MV( mx, my ); } } else { int dir = 0; pixel *pix_base = p_fref_w + omx + (omy-4*i)*stride; int dy = i*stride; #define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\ h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ pix_base x0*i+(y0-2*k+4)*dy,\ pix_base x1*i+(y1-2*k+4)*dy,\ pix_base x2*i+(y2-2*k+4)*dy,\ pix_base x3*i+(y3-2*k+4)*dy,\ stride, costs+4*k );\ pix_base += 2*dy; #define ADD_MVCOST(k,x,y) costs[k] += p_cost_omvx[x*4*i] + p_cost_omvy[y*4*i] #define MIN_MV(k,x,y) COPY2_IF_LT( bcost, costs[k], dir, x*16+(y&15) ) SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 ); SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 ); SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 ); SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 ); ADD_MVCOST( 0, 0,-4 ); ADD_MVCOST( 1, 0, 4 ); ADD_MVCOST( 2,-2,-3 ); ADD_MVCOST( 3, 2,-3 ); ADD_MVCOST( 4,-4,-2 ); ADD_MVCOST( 5, 4,-2 ); ADD_MVCOST( 6,-4,-1 ); ADD_MVCOST( 7, 4,-1 ); ADD_MVCOST( 8,-4, 0 ); ADD_MVCOST( 9, 4, 0 ); ADD_MVCOST( 10,-4, 1 ); ADD_MVCOST( 11, 4, 1 ); ADD_MVCOST( 12,-4, 2 ); ADD_MVCOST( 13, 4, 2 ); ADD_MVCOST( 14,-2, 3 ); ADD_MVCOST( 15, 2, 3 ); MIN_MV( 0, 0,-4 ); MIN_MV( 1, 0, 4 ); MIN_MV( 2,-2,-3 ); MIN_MV( 3, 2,-3 ); MIN_MV( 4,-4,-2 ); MIN_MV( 5, 4,-2 ); MIN_MV( 6,-4,-1 ); MIN_MV( 7, 4,-1 ); MIN_MV( 8,-4, 0 ); MIN_MV( 9, 4, 0 ); MIN_MV( 10,-4, 1 ); MIN_MV( 11, 4, 1 ); MIN_MV( 12,-4, 2 ); MIN_MV( 13, 4, 2 ); MIN_MV( 14,-2, 3 ); MIN_MV( 15, 2, 3 ); #undef SADS #undef ADD_MVCOST #undef MIN_MV if( dir ) { bmx = omx + i*(dir>>4); bmy = omy + i*((int32_t)((uint32_t)dir<<28)>>28); } } } while( ++i <= i_me_range>>2 ); if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min ) goto me_hex2; break; } case X264_ME_ESA: case X264_ME_TESA: { const int min_x = X264_MAX( bmx - i_me_range, mv_x_min ); const int min_y = X264_MAX( bmy - i_me_range, mv_y_min ); const int max_x = X264_MIN( bmx + i_me_range, mv_x_max ); const int max_y = X264_MIN( bmy + i_me_range, mv_y_max ); /* SEA is fastest in multiples of 4 */ const int width = (max_x - min_x + 3) & ~3; #if 0 /* plain old exhaustive search */ for( int my = min_y; my <= max_y; my++ ) for( int mx = min_x; mx < min_x + width; mx++ ) COST_MV( mx, my ); #else /* successive elimination by comparing DC before a full SAD, * because sum(abs(diff)) >= abs(diff(sum)). */ uint16_t *sums_base = m->integral; ALIGNED_ARRAY_16( int, enc_dc,[4] ); int sad_size = i_pixel <= PIXEL_8x8 ? 
PIXEL_8x8 : PIXEL_4x4; int delta = x264_pixel_size[sad_size].w; int16_t *xs = h->scratch_buffer; int xn; uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2); h->pixf.sad_x4[sad_size]( (pixel*)x264_zero, p_fenc, p_fenc+delta, p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE, FENC_STRIDE, enc_dc ); if( delta == 4 ) sums_base += stride * (h->fenc->i_lines[0] + PADV*2); if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) delta *= stride; if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) enc_dc[1] = enc_dc[2]; if( h->mb.i_me_method == X264_ME_TESA ) { // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD mvsad_t *mvsads = (mvsad_t *)(xs + ((width+31)&~31) + 4); int nmvsad = 0, limit; int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride ) + BITS_MVD( bmx, bmy ); for( int my = min_y; my <= max_y; my++ ) { int i; int ycost = p_cost_mvy[my*4]; if( bsad <= ycost ) continue; bsad -= ycost; xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, cost_fpel_mvx+min_x, xs, width, bsad * 17 >> 4 ); for( i = 0; i < xn-2; i += 3 ) { pixel *ref = p_fref_w+min_x+my*stride; ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */ h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads ); for( int j = 0; j < 3; j++ ) { int sad = sads[j] + cost_fpel_mvx[xs[i+j]]; if( sad < bsad*sad_thresh>>3 ) { COPY1_IF_LT( bsad, sad ); mvsads[nmvsad].sad = sad + ycost; mvsads[nmvsad].mv[0] = min_x+xs[i+j]; mvsads[nmvsad].mv[1] = my; nmvsad++; } } } for( ; i < xn; i++ ) { int mx = min_x+xs[i]; int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride ) + cost_fpel_mvx[xs[i]]; if( sad < bsad*sad_thresh>>3 ) { COPY1_IF_LT( bsad, sad ); mvsads[nmvsad].sad = sad + ycost; mvsads[nmvsad].mv[0] = mx; mvsads[nmvsad].mv[1] = my; nmvsad++; } } bsad += ycost; } limit = i_me_range >> 1; sad_thresh = bsad*sad_thresh>>3; while( nmvsad > limit*2 && sad_thresh > bsad ) { int i = 0; // halve the range if the domain is too large... 
eh, close enough sad_thresh = (sad_thresh + bsad) >> 1; while( i < nmvsad && mvsads[i].sad <= sad_thresh ) i++; for( int j = i; j < nmvsad; j++ ) { uint32_t sad; if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 ) { uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] ); #if WORDS_BIGENDIAN mvsad >>= 32; #endif sad = mvsad; } else { sad = mvsads[j].sad; CP32( mvsads[i].mv, mvsads[j].mv ); mvsads[i].sad = sad; } i += (sad - (sad_thresh+1)) >> 31; } nmvsad = i; } while( nmvsad > limit ) { int bi = 0; for( int i = 1; i < nmvsad; i++ ) if( mvsads[i].sad > mvsads[bi].sad ) bi = i; nmvsad--; if( sizeof( mvsad_t ) == sizeof( uint64_t ) ) CP64( &mvsads[bi], &mvsads[nmvsad] ); else mvsads[bi] = mvsads[nmvsad]; } for( int i = 0; i < nmvsad; i++ ) COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] ); } else { // just ADS and SAD for( int my = min_y; my <= max_y; my++ ) { int i; int ycost = p_cost_mvy[my*4]; if( bcost <= ycost ) continue; bcost -= ycost; xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, cost_fpel_mvx+min_x, xs, width, bcost ); for( i = 0; i < xn-2; i += 3 ) COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my ); bcost += ycost; for( ; i < xn; i++ ) COST_MV( min_x+xs[i], my ); } } #endif } break; } /* -> qpel mv */ uint32_t bmv = pack16to32_mask(bmx,bmy); uint32_t bmv_spel = SPELx2(bmv); if( h->mb.i_subpel_refine < 3 ) { m->cost_mv = p_cost_mvx[bmx*4] + p_cost_mvy[bmy*4]; m->cost = bcost; /* compute the real cost */ if( bmv == pmv ) m->cost += m->cost_mv; M32( m->mv ) = bmv_spel; } else { M32(m->mv) = bpred_cost < bcost ? bpred_mv : bmv_spel; m->cost = X264_MIN( bpred_cost, bcost ); } /* subpel refine */ if( h->mb.i_subpel_refine >= 2 ) { int hpel = subpel_iterations[h->mb.i_subpel_refine][2]; int qpel = subpel_iterations[h->mb.i_subpel_refine][3]; refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 ); } } #undef COST_MV void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) { int hpel = subpel_iterations[h->mb.i_subpel_refine][0]; int qpel = subpel_iterations[h->mb.i_subpel_refine][1]; if( m->i_pixel <= PIXEL_8x8 ) m->cost -= m->i_ref_cost; refine_subpel( h, m, hpel, qpel, NULL, 1 ); } void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh ) { refine_subpel( h, m, 0, X264_MIN( 2, subpel_iterations[h->mb.i_subpel_refine][3] ), p_halfpel_thresh, 0 ); } #define COST_MV_SAD( mx, my ) \ { \ intptr_t stride = 16; \ pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \ int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ } #define COST_MV_SATD( mx, my, dir ) \ if( b_refine_qpel || (dir^1) != odir ) \ { \ intptr_t stride = 16; \ pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \ int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ if( b_chroma_me && cost < bcost ) \ { \ if( CHROMA444 ) \ { \ stride = 16; \ src = h->mc.get_ref( pix, &stride, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \ cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[1], FENC_STRIDE, src, stride ); \ if( cost < bcost ) \ { \ stride = 16; \ src = h->mc.get_ref( pix, &stride, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \ cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[2], FENC_STRIDE, src, stride ); \ } \ } \ else \ { \ h->mc.mc_chroma( pix, 
pix+8, 16, m->p_fref[4], m->i_stride[1], \ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \ if( m->weight[1].weightfn ) \ m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ if( m->weight[2].weightfn ) \ m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ } \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \ } static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ) { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; const int i_pixel = m->i_pixel; const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444); int chromapix = h->luma2chroma_pixel[i_pixel]; int chroma_v_shift = CHROMA_V_SHIFT; int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment ALIGNED_ARRAY_16( int, costs,[4] ); int bmx = m->mv[0]; int bmy = m->mv[1]; int bcost = m->cost; int odir = -1, bdir; /* halfpel diamond search */ if( hpel_iters ) { /* try the subpel component of the predicted mv */ if( h->mb.i_subpel_refine < 3 ) { int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); if( (mx-bmx)|(my-bmy) ) COST_MV_SAD( mx, my ); } bcost <<= 6; for( int i = hpel_iters; i > 0; i-- ) { int omx = bmx, omy = bmy; intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough pixel *src0, *src1, *src2, *src3; src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] ); src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] ); src1 = src0 + stride; src3 = src2 + 1; h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); costs[0] += p_cost_mvx[omx ] + p_cost_mvy[omy-2]; costs[1] += p_cost_mvx[omx ] + p_cost_mvy[omy+2]; costs[2] += p_cost_mvx[omx-2] + p_cost_mvy[omy ]; costs[3] += p_cost_mvx[omx+2] + p_cost_mvy[omy ]; COPY1_IF_LT( bcost, (costs[0]<<6)+2 ); COPY1_IF_LT( bcost, (costs[1]<<6)+6 ); COPY1_IF_LT( bcost, (costs[2]<<6)+16 ); COPY1_IF_LT( bcost, (costs[3]<<6)+48 ); if( !(bcost&63) ) break; bmx -= (int32_t)((uint32_t)bcost<<26)>>29; bmy -= (int32_t)((uint32_t)bcost<<29)>>29; bcost &= ~63; } bcost >>= 6; } if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) ) { bcost = COST_MAX; COST_MV_SATD( bmx, bmy, -1 ); } /* early termination when examining multiple reference frames */ if( p_halfpel_thresh ) { if( (bcost*7)>>3 > *p_halfpel_thresh ) { m->cost = bcost; m->mv[0] = bmx; m->mv[1] = bmy; // don't need cost_mv return; } else if( bcost < *p_halfpel_thresh ) *p_halfpel_thresh = bcost; } /* quarterpel diamond search */ if( h->mb.i_subpel_refine != 1 ) { bdir = -1; for( int i = qpel_iters; i > 0; i-- ) { if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] ) break; odir = bdir; int omx = bmx, omy = bmy; COST_MV_SATD( omx, omy - 1, 0 ); 
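/* dir 0-3 records which diamond neighbour won; unless b_refine_qpel is set, COST_MV_SATD skips the candidate with (dir^1) == odir, i.e. the point we just came from, since it has already been scored. */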
COST_MV_SATD( omx, omy + 1, 1 ); COST_MV_SATD( omx - 1, omy, 2 ); COST_MV_SATD( omx + 1, omy, 3 ); if( (bmx == omx) & (bmy == omy) ) break; } } /* Special simplified case for subme=1 */ else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] ) { int omx = bmx, omy = bmy; /* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */ h->mc.mc_luma( pix , 64, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] ); h->mc.mc_luma( pix+16, 64, m->p_fref, m->i_stride[0], omx, omy+1, bw, bh, &m->weight[0] ); h->mc.mc_luma( pix+32, 64, m->p_fref, m->i_stride[0], omx-1, omy, bw, bh, &m->weight[0] ); h->mc.mc_luma( pix+48, 64, m->p_fref, m->i_stride[0], omx+1, omy, bw, bh, &m->weight[0] ); h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], pix, pix+16, pix+32, pix+48, 64, costs ); costs[0] += p_cost_mvx[omx ] + p_cost_mvy[omy-1]; costs[1] += p_cost_mvx[omx ] + p_cost_mvy[omy+1]; costs[2] += p_cost_mvx[omx-1] + p_cost_mvy[omy ]; costs[3] += p_cost_mvx[omx+1] + p_cost_mvy[omy ]; bcost <<= 4; COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); COPY1_IF_LT( bcost, (costs[1]<<4)+3 ); COPY1_IF_LT( bcost, (costs[2]<<4)+4 ); COPY1_IF_LT( bcost, (costs[3]<<4)+12 ); bmx -= (int32_t)((uint32_t)bcost<<28)>>30; bmy -= (int32_t)((uint32_t)bcost<<30)>>30; bcost >>= 4; } m->cost = bcost; m->mv[0] = bmx; m->mv[1] = bmy; m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy]; } #define BIME_CACHE( dx, dy, list )\ {\ x264_me_t *m = m##list;\ int i = 4 + 3*dx + dy;\ int mvx = bm##list##x+dx;\ int mvy = bm##list##y+dy;\ stride[0][list][i] = bw;\ src[0][list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[0][list][i], &m->p_fref[0],\ m->i_stride[0], mvx, mvy, bw, bh, x264_weight_none );\ if( rd )\ {\ if( CHROMA444 )\ {\ stride[1][list][i] = bw;\ src[1][list][i] = h->mc.get_ref( pixu_buf[list][i], &stride[1][list][i], &m->p_fref[4],\ m->i_stride[1], mvx, mvy, bw, bh, x264_weight_none );\ stride[2][list][i] = bw;\ src[2][list][i] = h->mc.get_ref( pixv_buf[list][i], &stride[2][list][i], &m->p_fref[8],\ m->i_stride[2], mvx, mvy, bw, bh, x264_weight_none );\ }\ else if( CHROMA_FORMAT )\ h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\ mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\ }\ } #define SATD_THRESH(cost) (cost+(cost>>4)) /* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this * other than making its iteration count not a compile-time constant. */ #define x264_iter_kludge x264_template(iter_kludge) int x264_iter_kludge = 0; static ALWAYS_INLINE void me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd ) { int x = i8&1; int y = i8>>1; int s8 = X264_SCAN8_0 + 2*x + 16*y; int16_t *cache0_mv = h->mb.cache.mv[0][s8]; int16_t *cache1_mv = h->mb.cache.mv[1][s8]; const int i_pixel = m0->i_pixel; const int bw = x264_pixel_size[i_pixel].w; const int bh = x264_pixel_size[i_pixel].h; ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] ); ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] ); ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] ); pixel *src[3][2][9]; int chromapix = h->luma2chroma_pixel[i_pixel]; int chroma_v_shift = CHROMA_V_SHIFT; int chroma_x = (8 >> CHROMA_H_SHIFT) * x; int chroma_y = (8 >> chroma_v_shift) * y; pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE]; pixel *pixu = CHROMA_FORMAT ? &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE] : NULL; pixel *pixv = CHROMA_FORMAT ? 
&h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE] : NULL; int ref0 = h->mb.cache.ref[0][s8]; int ref1 = h->mb.cache.ref[1][s8]; const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; intptr_t stride[3][2][9]; int bm0x = m0->mv[0]; int bm0y = m0->mv[1]; int bm1x = m1->mv[0]; int bm1y = m1->mv[1]; int bcost = COST_MAX; int mc_list0 = 1, mc_list1 = 1; uint64_t bcostrd = COST_MAX64; uint16_t amvd; /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] ); /* all permutations of an offset in up to 2 of the dimensions */ ALIGNED_4( static const int8_t dia4d[33][4] ) = { {0,0,0,0}, {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0}, {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0}, {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0}, {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1}, {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0}, {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0}, {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1}, {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0}, }; if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 || bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 || bm0x < h->mb.mv_min_spel[0] + 8 || bm1x < h->mb.mv_min_spel[0] + 8 || bm0x > h->mb.mv_max_spel[0] - 8 || bm1x > h->mb.mv_max_spel[0] - 8 ) return; if( rd && m0->i_pixel != PIXEL_16x16 && i8 != 0 ) { x264_mb_predict_mv( h, 0, i8<<2, bw>>2, m0->mvp ); x264_mb_predict_mv( h, 1, i8<<2, bw>>2, m1->mvp ); } const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) ); for( int pass = 0; pass < 8; pass++ ) { int bestj = 0; /* check all mv pairs that differ in at most 2 components from the current mvs. */ /* doesn't do chroma ME. this probably doesn't matter, as the gains * from bidir ME are the same with and without chroma ME. 
*/ if( mc_list0 ) for( int j = x264_iter_kludge; j < 9; j++ ) BIME_CACHE( square1[j][0], square1[j][1], 0 ); if( mc_list1 ) for( int j = x264_iter_kludge; j < 9; j++ ) BIME_CACHE( square1[j][0], square1[j][1], 1 ); for( int j = !!pass; j < 33; j++ ) { int m0x = dia4d[j][0] + bm0x; int m0y = dia4d[j][1] + bm0y; int m1x = dia4d[j][2] + bm1x; int m1y = dia4d[j][3] + bm1y; if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) { int i0 = 4 + 3*dia4d[j][0] + dia4d[j][1]; int i1 = 4 + 3*dia4d[j][2] + dia4d[j][3]; visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7)); h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][0][i0], stride[0][0][i0], src[0][1][i1], stride[0][1][i1], i_weight ); int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y]; if( rd ) { if( cost < SATD_THRESH(bcost) ) { bcost = X264_MIN( cost, bcost ); M32( cache0_mv ) = pack16to32_mask(m0x,m0y); M32( cache1_mv ) = pack16to32_mask(m1x,m1y); if( CHROMA444 ) { h->mc.avg[i_pixel]( pixu, FDEC_STRIDE, src[1][0][i0], stride[1][0][i0], src[1][1][i1], stride[1][1][i1], i_weight ); h->mc.avg[i_pixel]( pixv, FDEC_STRIDE, src[2][0][i0], stride[2][0][i0], src[2][1][i1], stride[2][1][i1], i_weight ); } else if( CHROMA_FORMAT ) { h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); } uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); COPY2_IF_LT( bcostrd, costrd, bestj, j ); } } else COPY2_IF_LT( bcost, cost, bestj, j ); } } if( !bestj ) break; bm0x += dia4d[bestj][0]; bm0y += dia4d[bestj][1]; bm1x += dia4d[bestj][2]; bm1y += dia4d[bestj][3]; mc_list0 = M16( &dia4d[bestj][0] ); mc_list1 = M16( &dia4d[bestj][2] ); } if( rd ) { x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) ); amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) ); x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd ); x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) ); amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) ); x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd ); } m0->mv[0] = bm0x; m0->mv[1] = bm0y; m1->mv[0] = bm1x; m1->mv[1] = bm1y; } void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ) { me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 ); } void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 ) { /* Motion compensation is done as part of bidir_rd; don't repeat * it in encoding. 
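* (b_skip_mc is honoured in macroblock.c above: macroblock_encode_p8x8_internal calls x264_mb_mc_8x8 only when the flag is clear.)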
*/ h->mb.b_skip_mc = 1; me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 ); h->mb.b_skip_mc = 0; } #undef COST_MV_SATD #define COST_MV_SATD( mx, my, dst, avoid_mvp ) \ { \ if( !avoid_mvp || !(mx == pmx && my == pmy) ) \ { \ h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \ dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \ + p_cost_mvx[mx] + p_cost_mvy[my]; \ COPY1_IF_LT( bsatd, dst ); \ } \ else \ dst = COST_MAX; \ } #define COST_MV_RD( mx, my, satd, do_dir, mdir ) \ { \ if( satd <= SATD_THRESH(bsatd) ) \ { \ uint64_t cost; \ M32( cache_mv ) = pack16to32_mask(mx,my); \ if( CHROMA444 ) \ { \ h->mc.mc_luma( pixu, FDEC_STRIDE, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \ h->mc.mc_luma( pixv, FDEC_STRIDE, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \ } \ else if( CHROMA_FORMAT && m->i_pixel <= PIXEL_8x8 ) \ { \ h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \ if( m->weight[1].weightfn ) \ m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \ if( m->weight[2].weightfn ) \ m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \ } \ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ } void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list ) { int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]]; const uint16_t *p_cost_mvx, *p_cost_mvy; const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; const int i_pixel = m->i_pixel; int chroma_v_shift = CHROMA_V_SHIFT; int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; uint64_t bcost = COST_MAX64; int bmx = m->mv[0]; int bmy = m->mv[1]; int omx, omy, pmx, pmy; int satd, bsatd; int dir = -2; int i8 = i4>>2; uint16_t amvd; pixel *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]]; pixel *pixu, *pixv; if( CHROMA444 ) { pixu = &h->mb.pic.p_fdec[1][block_idx_xy_fdec[i4]]; pixv = &h->mb.pic.p_fdec[2][block_idx_xy_fdec[i4]]; } else if( CHROMA_FORMAT ) { pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4]; pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4]; } else { pixu = NULL; pixv = NULL; } h->mb.b_skip_mc = 1; if( m->i_pixel != PIXEL_16x16 && i4 != 0 ) x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp ); pmx = m->mvp[0]; pmy = m->mvp[1]; p_cost_mvx = m->p_cost_mv - pmx; p_cost_mvy = m->p_cost_mv - pmy; COST_MV_SATD( bmx, bmy, bsatd, 0 ); if( m->i_pixel != PIXEL_16x16 ) COST_MV_RD( bmx, bmy, 0, 0, 0 ) else bcost = m->cost; /* check the predicted mv */ if( (bmx != pmx || bmy != pmy) && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0] && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] ) { COST_MV_SATD( pmx, pmy, satd, 0 ); COST_MV_RD ( pmx, pmy, satd, 0, 0 ); /* The hex motion search is guaranteed to not repeat the center candidate, * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */ if( bmx == pmx && bmy == pmy ) { pmx = m->mv[0]; pmy = m->mv[1]; } } if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 || bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 ) { h->mb.b_skip_mc = 0; return; } /* subpel hex search, same pattern as ME HEX. 
*/ dir = -2; omx = bmx; omy = bmy; for( int j = 0; j < 6; j++ ) { COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 ); COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j ); } if( dir != -2 ) { /* half hexagon, not overlapping the previous iteration */ for( int i = 1; i < 10; i++ ) { const int odir = mod6m1[dir+1]; if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 ) break; dir = -2; omx = bmx; omy = bmy; for( int j = 0; j < 3; j++ ) { COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 ); COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j ); } if( dir == -2 ) break; } } /* square refine, same pattern as ME HEX. */ omx = bmx; omy = bmy; for( int i = 0; i < 8; i++ ) { COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 ); COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 ); } m->cost = bcost; m->mv[0] = bmx; m->mv[1] = bmy; x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) ); amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),66), X264_MIN(abs(bmy - m->mvp[1]),66) ); x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd ); h->mb.b_skip_mc = 0; } x264-master/encoder/me.h000066400000000000000000000071771502133446700152640ustar00rootroot00000000000000/***************************************************************************** * me.h: motion estimation ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ENCODER_ME_H #define X264_ENCODER_ME_H #define COST_MAX (1<<28) #define COST_MAX64 (1ULL<<60) typedef struct { /* aligning the first member is a gcc hack to force the struct to be aligned, * as well as force sizeof(struct) to be a multiple of the alignment. 
*/ /* input */ ALIGNED_64( int i_pixel ); /* PIXEL_WxH */ uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */ int i_ref_cost; int i_ref; const x264_weight_t *weight; pixel *p_fref[12]; pixel *p_fref_w; pixel *p_fenc[3]; uint16_t *integral; int i_stride[3]; ALIGNED_4( int16_t mvp[2] ); /* output */ int cost_mv; /* lambda * nbits for the chosen mv */ int cost; /* satd + lambda * nbits */ ALIGNED_8( int16_t mv[2] ); } ALIGNED_64( x264_me_t ); #define x264_me_search_ref x264_template(me_search_ref) void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh ); #define x264_me_search( h, m, mvc, i_mvc )\ x264_me_search_ref( h, m, mvc, i_mvc, NULL ) #define x264_me_refine_qpel x264_template(me_refine_qpel) void x264_me_refine_qpel( x264_t *h, x264_me_t *m ); #define x264_me_refine_qpel_refdupe x264_template(me_refine_qpel_refdupe) void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh ); #define x264_me_refine_qpel_rd x264_template(me_refine_qpel_rd) void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list ); #define x264_me_refine_bidir_rd x264_template(me_refine_bidir_rd) void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 ); #define x264_me_refine_bidir_satd x264_template(me_refine_bidir_satd) void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ); #define x264_rd_cost_part x264_template(rd_cost_part) uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); #define COPY1_IF_LT(x,y)\ if( (y) < (x) )\ (x) = (y); #define COPY2_IF_LT(x,y,a,b)\ if( (y) < (x) )\ {\ (x) = (y);\ (a) = (b);\ } #define COPY3_IF_LT(x,y,a,b,c,d)\ if( (y) < (x) )\ {\ (x) = (y);\ (a) = (b);\ (c) = (d);\ } #define COPY4_IF_LT(x,y,a,b,c,d,e,f)\ if( (y) < (x) )\ {\ (x) = (y);\ (a) = (b);\ (c) = (d);\ (e) = (f);\ } #define COPY2_IF_GT(x,y,a,b)\ if( (y) > (x) )\ {\ (x) = (y);\ (a) = (b);\ } #endif x264-master/encoder/ratecontrol.c000066400000000000000000003621611502133446700172060ustar00rootroot00000000000000/***************************************************************************** * ratecontrol.c: ratecontrol ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * Michael Niedermayer * Gabriel Bouvigne * Fiona Glaser * Måns Rullgård * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com.
*****************************************************************************/ #undef NDEBUG // always check asserts, the speed effect is far too small to disable them #include "common/common.h" #include "ratecontrol.h" #include "me.h" typedef struct { int pict_type; int frame_type; int kept_as_ref; double qscale; int mv_bits; int tex_bits; int misc_bits; double expected_bits; /* total expected bits up to the current frame (current one excluded) */ double expected_vbv; double new_qscale; float new_qp; int i_count; int p_count; int s_count; float blurred_complexity; char direct_mode; int16_t weight[3][2]; int16_t i_weight_denom[2]; int refcount[16]; int refs; int64_t i_duration; int64_t i_cpb_duration; int out_num; } ratecontrol_entry_t; typedef struct { float coeff_min; float coeff; float count; float decay; float offset; } predictor_t; struct x264_ratecontrol_t { /* constants */ int b_abr; int b_2pass; int b_vbv; int b_vbv_min_rate; double fps; double bitrate; double rate_tolerance; double qcompress; int nmb; /* number of macroblocks in a frame */ int qp_constant[3]; /* current frame */ ratecontrol_entry_t *rce; float qpm; /* qp for current macroblock: precise float for AQ */ float qpa_rc; /* average of macroblocks' qp before aq */ float qpa_rc_prev; int qpa_aq; /* average of macroblocks' qp after aq */ int qpa_aq_prev; float qp_novbv; /* QP for the current frame if 1-pass VBV was disabled. */ /* VBV stuff */ double buffer_size; int64_t buffer_fill_final; int64_t buffer_fill_final_min; double buffer_fill; /* planned buffer, if all in-progress frames hit their bit budget */ double buffer_rate; /* # of bits added to buffer_fill after each frame */ double vbv_max_rate; /* # of bits added to buffer_fill per second */ predictor_t *pred; /* predict frame size from satd */ int single_frame_vbv; float rate_factor_max_increment; /* Don't allow RF above (CRF + this value). */ /* ABR stuff */ int last_satd; double last_rceq; double cplxr_sum; /* sum of bits*qscale/rceq */ double expected_bits_sum; /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */ int64_t filler_bits_sum; /* sum in bits of finished frames' filler data */ double wanted_bits_window; /* target bitrate * window */ double cbr_decay; double short_term_cplxsum; double short_term_cplxcount; double rate_factor_constant; double ip_offset; double pb_offset; /* 2pass stuff */ FILE *p_stat_file_out; char *psz_stat_file_tmpname; FILE *p_mbtree_stat_file_out; char *psz_mbtree_stat_file_tmpname; char *psz_mbtree_stat_file_name; FILE *p_mbtree_stat_file_in; int num_entries; /* number of ratecontrol_entry_ts */ ratecontrol_entry_t *entry; /* FIXME: copy needed data and free this once init is done */ ratecontrol_entry_t **entry_out; double last_qscale; double last_qscale_for[3]; /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */ int last_non_b_pict_type; double accum_p_qp; /* for determining I-frame quant */ double accum_p_norm; double last_accum_p_norm; double lmin[3]; /* min qscale by frame type */ double lmax[3]; double lstep; /* max change (multiply) in qscale per frame */ struct { uint16_t *qp_buffer[2]; /* Global buffers for converting MB-tree quantizer data. */ int qpbuf_pos; /* In order to handle pyramid reordering, QP buffer acts as a stack. * This value is the current position (0 or 1). 
*/ int src_mb_count; /* For rescaling */ int rescale_enabled; float *scale_buffer[2]; /* Intermediate buffers */ int filtersize[2]; /* filter size (H/V) */ float *coeffs[2]; int *pos[2]; int srcdim[2]; /* Source dimensions (W/H) */ } mbtree; /* MBRC stuff */ volatile float frame_size_estimated; /* Access to this variable must be atomic: double is * not atomic on all arches we care about */ volatile float bits_so_far; double frame_size_maximum; /* Maximum frame size due to MinCR */ double frame_size_planned; double slice_size_planned; predictor_t *row_pred; predictor_t row_preds[3][2]; predictor_t *pred_b_from_p; /* predict B-frame size from P-frame satd */ int bframes; /* # consecutive B-frames before this P-frame */ int bframe_bits; /* total cost of those frames */ int i_zones; x264_zone_t *zones; x264_zone_t *prev_zone; /* hrd stuff */ int initial_cpb_removal_delay; int initial_cpb_removal_delay_offset; double nrt_first_access_unit; /* nominal removal time */ double previous_cpb_final_arrival_time; uint64_t hrd_multiply_denom; }; static int parse_zones( x264_t *h ); static int init_pass2(x264_t *); static float rate_estimate_qscale( x264_t *h ); static int update_vbv( x264_t *h, int bits ); static void update_vbv_plan( x264_t *h, int overhead ); static float predict_size( predictor_t *p, float q, float var ); static void update_predictor( predictor_t *p, float q, float var, float bits ); #define CMP_OPT_FIRST_PASS( opt, param_val )\ {\ if( ( p = strstr( opts, opt "=" ) ) && sscanf( p, opt "=%d" , &i ) && param_val != i )\ {\ x264_log( h, X264_LOG_ERROR, "different " opt " setting than first pass (%d vs %d)\n", param_val, i );\ return -1;\ }\ } /* Terminology: * qp = h.264's quantizer * qscale = linearized quantizer = Lagrange multiplier */ static inline float qp2qscale( float qp ) { return 0.85f * powf( 2.0f, ( qp - (12.0f + QP_BD_OFFSET) ) / 6.0f ); } static inline float qscale2qp( float qscale ) { return (12.0f + QP_BD_OFFSET) + 6.0f * log2f( qscale/0.85f ); } /* Texture bitrate is not quite inversely proportional to qscale, * probably due the the changing number of SKIP blocks. * MV bits level off at about qp<=12, because the lambda used * for motion estimation is constant there. */ static inline double qscale2bits( ratecontrol_entry_t *rce, double qscale ) { if( qscale<0.1 ) qscale = 0.1; return (rce->tex_bits + .1) * pow( rce->qscale / qscale, 1.1 ) + rce->mv_bits * pow( X264_MAX(rce->qscale, 1) / X264_MAX(qscale, 1), 0.5 ) + rce->misc_bits; } static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i, int b_store ) { uint32_t sum = sum_ssd; uint32_t ssd = sum_ssd >> 32; if( b_store ) { frame->i_pixel_sum[i] += sum; frame->i_pixel_ssd[i] += ssd; } return ssd - ((uint64_t)sum * sum >> shift); } static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store ) { int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16; int stride = frame->i_stride[i]; int offset = b_field ? 
16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride : 16 * mb_x + height * mb_y * stride; stride <<= b_field; if( b_chroma ) { ALIGNED_ARRAY_64( pixel, pix,[FENC_STRIDE*16] ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int shift = 7 - CHROMA_V_SHIFT; h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height ); return ac_energy_var( h->pixf.var[chromapix]( pix, FENC_STRIDE ), shift, frame, 1, b_store ) + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store ); } else return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store ); } // Find the total AC energy of the block in all planes. static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame ) { /* This function contains annoying hacks because GCC has a habit of reordering emms * and putting it after floating point ops. As a result, we put the emms at the end of the * function and make sure that its always called before the float math. Noinline makes * sure no reordering goes on. */ uint32_t var; x264_prefetch_fenc( h, frame, mb_x, mb_y ); if( h->mb.b_adaptive_mbaff ) { /* We don't know the super-MB mode we're going to pick yet, so * simply try both and pick the lower of the two. */ uint32_t var_interlaced, var_progressive; var_interlaced = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 1, 1 ); var_progressive = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, 0, 0 ); if( CHROMA444 ) { var_interlaced += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 1, 1 ); var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, 0, 0 ); var_interlaced += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 1, 1 ); var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, 0, 0 ); } else if( CHROMA_FORMAT ) { var_interlaced += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 1, 1 ); var_progressive += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, 0, 0 ); } var = X264_MIN( var_interlaced, var_progressive ); } else { var = ac_energy_plane( h, mb_x, mb_y, frame, 0, 0, PARAM_INTERLACED, 1 ); if( CHROMA444 ) { var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 0, PARAM_INTERLACED, 1 ); var += ac_energy_plane( h, mb_x, mb_y, frame, 2, 0, PARAM_INTERLACED, 1 ); } else if( CHROMA_FORMAT ) var += ac_energy_plane( h, mb_x, mb_y, frame, 1, 1, PARAM_INTERLACED, 1 ); } x264_emms(); return var; } void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets ) { /* Initialize frame stats */ for( int i = 0; i < 3; i++ ) { frame->i_pixel_sum[i] = 0; frame->i_pixel_ssd[i] = 0; } /* Degenerate cases */ if( h->param.rc.i_aq_mode == X264_AQ_NONE || h->param.rc.f_aq_strength == 0 ) { /* Need to init it anyways for MB tree */ if( h->param.rc.i_aq_mode && h->param.rc.f_aq_strength == 0 ) { if( quant_offsets ) { for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ ) frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = quant_offsets[mb_xy]; if( h->frames.b_have_lowres ) for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ ) frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8( frame->f_qp_offset[mb_xy] ); } else { memset( frame->f_qp_offset, 0, h->mb.i_mb_count * sizeof(float) ); memset( frame->f_qp_offset_aq, 0, h->mb.i_mb_count * sizeof(float) ); if( h->frames.b_have_lowres ) for( int mb_xy = 0; mb_xy < h->mb.i_mb_count; mb_xy++ ) frame->i_inv_qscale_factor[mb_xy] = 256; } } /* Need variance data for weighted prediction */ if( h->param.analyse.i_weighted_pred ) { for( int mb_y = 0; mb_y < 
h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) ac_energy_mb( h, mb_x, mb_y, frame ); } else return; } /* Actual adaptive quantization */ else { /* constants chosen to result in approximately the same overall bitrate as without AQ. * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ float strength; float avg_adj = 0.f; float bias_strength = 0.f; if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE || h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) { float bit_depth_correction = 1.f / (1 << (2*(BIT_DEPTH-8))); float avg_adj_pow2 = 0.f; for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) { uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame ); float qp_adj = powf( energy * bit_depth_correction + 1, 0.125f ); frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj; avg_adj += qp_adj; avg_adj_pow2 += qp_adj * qp_adj; } avg_adj /= h->mb.i_mb_count; avg_adj_pow2 /= h->mb.i_mb_count; strength = h->param.rc.f_aq_strength * avg_adj; avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj; bias_strength = h->param.rc.f_aq_strength; } else strength = h->param.rc.f_aq_strength * 1.0397f; for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) { float qp_adj; int mb_xy = mb_x + mb_y*h->mb.i_mb_stride; if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) { qp_adj = frame->f_qp_offset[mb_xy]; qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 14.f / (qp_adj * qp_adj)); } else if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) { qp_adj = frame->f_qp_offset[mb_xy]; qp_adj = strength * (qp_adj - avg_adj); } else { uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame ); qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - (14.427f + 2*(BIT_DEPTH-8))); } if( quant_offsets ) qp_adj += quant_offsets[mb_xy]; frame->f_qp_offset[mb_xy] = frame->f_qp_offset_aq[mb_xy] = qp_adj; if( h->frames.b_have_lowres ) frame->i_inv_qscale_factor[mb_xy] = x264_exp2fix8(qp_adj); } } /* Remove mean from SSD calculation */ for( int i = 0; i < 3; i++ ) { uint64_t ssd = frame->i_pixel_ssd[i]; uint64_t sum = frame->i_pixel_sum[i]; int width = 16*h->mb.i_mb_width >> (i && CHROMA_H_SHIFT); int height = 16*h->mb.i_mb_height >> (i && CHROMA_V_SHIFT); frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height); } } static int macroblock_tree_rescale_init( x264_t *h, x264_ratecontrol_t *rc ) { /* Use fractional QP array dimensions to compensate for edge padding */ float srcdim[2] = {rc->mbtree.srcdim[0] / 16.f, rc->mbtree.srcdim[1] / 16.f}; float dstdim[2] = { h->param.i_width / 16.f, h->param.i_height / 16.f}; int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])}; int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])}; if( h->param.b_interlaced || h->param.b_fake_interlaced ) { srcdimi[1] = (srcdimi[1]+1)&~1; dstdimi[1] = (dstdimi[1]+1)&~1; } rc->mbtree.src_mb_count = srcdimi[0] * srcdimi[1]; CHECKED_MALLOC( rc->mbtree.qp_buffer[0], rc->mbtree.src_mb_count * sizeof(uint16_t) ); if( h->param.i_bframe_pyramid && h->param.rc.b_stat_read ) CHECKED_MALLOC( rc->mbtree.qp_buffer[1], rc->mbtree.src_mb_count * sizeof(uint16_t) ); rc->mbtree.qpbuf_pos = -1; /* No rescaling to do */ if( srcdimi[0] == dstdimi[0] && srcdimi[1] == dstdimi[1] ) return 0; rc->mbtree.rescale_enabled = 1; /* Allocate intermediate scaling buffers */ CHECKED_MALLOC( rc->mbtree.scale_buffer[0], srcdimi[0] * srcdimi[1] * sizeof(float) ); CHECKED_MALLOC( 
rc->mbtree.scale_buffer[1], dstdimi[0] * srcdimi[1] * sizeof(float) ); /* Allocate and calculate resize filter parameters and coefficients */ for( int i = 0; i < 2; i++ ) { if( srcdim[i] > dstdim[i] ) // downscale rc->mbtree.filtersize[i] = 1 + (2 * srcdimi[i] + dstdimi[i] - 1) / dstdimi[i]; else // upscale rc->mbtree.filtersize[i] = 3; CHECKED_MALLOC( rc->mbtree.coeffs[i], rc->mbtree.filtersize[i] * dstdimi[i] * sizeof(float) ); CHECKED_MALLOC( rc->mbtree.pos[i], dstdimi[i] * sizeof(int) ); /* Initialize filter coefficients */ float inc = srcdim[i] / dstdim[i]; float dmul = inc > 1.f ? dstdim[i] / srcdim[i] : 1.f; float dstinsrc = 0.5f * inc - 0.5f; int filtersize = rc->mbtree.filtersize[i]; for( int j = 0; j < dstdimi[i]; j++ ) { int pos = dstinsrc - (filtersize - 2.f) * 0.5f; float sum = 0.0; rc->mbtree.pos[i][j] = pos; for( int k = 0; k < filtersize; k++ ) { float d = fabs( pos + k - dstinsrc ) * dmul; float coeff = X264_MAX( 1.f - d, 0 ); rc->mbtree.coeffs[i][j * filtersize + k] = coeff; sum += coeff; } sum = 1.0f / sum; for( int k = 0; k < filtersize; k++ ) rc->mbtree.coeffs[i][j * filtersize + k] *= sum; dstinsrc += inc; } } /* Write back actual qp array dimensions */ rc->mbtree.srcdim[0] = srcdimi[0]; rc->mbtree.srcdim[1] = srcdimi[1]; return 0; fail: return -1; } static void macroblock_tree_rescale_destroy( x264_ratecontrol_t *rc ) { for( int i = 0; i < 2; i++ ) { x264_free( rc->mbtree.qp_buffer[i] ); x264_free( rc->mbtree.scale_buffer[i] ); x264_free( rc->mbtree.coeffs[i] ); x264_free( rc->mbtree.pos[i] ); } } static ALWAYS_INLINE float tapfilter( float *src, int pos, int max, int stride, float *coeff, int filtersize ) { float sum = 0.f; for( int i = 0; i < filtersize; i++, pos++ ) sum += src[x264_clip3( pos, 0, max-1 )*stride] * coeff[i]; return sum; } static void macroblock_tree_rescale( x264_t *h, x264_ratecontrol_t *rc, float *dst ) { float *input, *output; int filtersize, stride, height; /* H scale first */ input = rc->mbtree.scale_buffer[0]; output = rc->mbtree.scale_buffer[1]; filtersize = rc->mbtree.filtersize[0]; stride = rc->mbtree.srcdim[0]; height = rc->mbtree.srcdim[1]; for( int y = 0; y < height; y++, input += stride, output += h->mb.i_mb_width ) { float *coeff = rc->mbtree.coeffs[0]; for( int x = 0; x < h->mb.i_mb_width; x++, coeff+=filtersize ) output[x] = tapfilter( input, rc->mbtree.pos[0][x], stride, 1, coeff, filtersize ); } /* V scale next */ input = rc->mbtree.scale_buffer[1]; output = dst; filtersize = rc->mbtree.filtersize[1]; stride = h->mb.i_mb_width; height = rc->mbtree.srcdim[1]; for( int x = 0; x < h->mb.i_mb_width; x++, input++, output++ ) { float *coeff = rc->mbtree.coeffs[1]; for( int y = 0; y < h->mb.i_mb_height; y++, coeff+=filtersize ) output[y*stride] = tapfilter( input, rc->mbtree.pos[1][y], height, stride, coeff, filtersize ); } } int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets ) { x264_ratecontrol_t *rc = h->rc; uint8_t i_type_actual = rc->entry[frame->i_frame].pict_type; if( rc->entry[frame->i_frame].kept_as_ref ) { uint8_t i_type; if( rc->mbtree.qpbuf_pos < 0 ) { do { rc->mbtree.qpbuf_pos++; if( !fread( &i_type, 1, 1, rc->p_mbtree_stat_file_in ) ) goto fail; if( fread( rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], sizeof(uint16_t), rc->mbtree.src_mb_count, rc->p_mbtree_stat_file_in ) != (unsigned)rc->mbtree.src_mb_count ) goto fail; if( i_type != i_type_actual && rc->mbtree.qpbuf_pos == 1 ) { x264_log( h, X264_LOG_ERROR, "MB-tree frametype %d doesn't match actual frametype %d.\n", i_type, i_type_actual ); 
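                /* Both stack slots have been tried at this point: the entry left in
                 * slot 0 did not match, and neither does the one just read into slot 1,
                 * so the .mbtree file cannot be reconciled with the current frame order
                 * and it is safer to abort than to apply offsets from the wrong frame. */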
return -1; } } while( i_type != i_type_actual ); } float *dst = rc->mbtree.rescale_enabled ? rc->mbtree.scale_buffer[0] : frame->f_qp_offset; h->mc.mbtree_fix8_unpack( dst, rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], rc->mbtree.src_mb_count ); if( rc->mbtree.rescale_enabled ) macroblock_tree_rescale( h, rc, frame->f_qp_offset ); if( h->frames.b_have_lowres ) for( int i = 0; i < h->mb.i_mb_count; i++ ) frame->i_inv_qscale_factor[i] = x264_exp2fix8( frame->f_qp_offset[i] ); rc->mbtree.qpbuf_pos--; } else x264_adaptive_quant_frame( h, frame, quant_offsets ); return 0; fail: x264_log( h, X264_LOG_ERROR, "Incomplete MB-tree stats file.\n" ); return -1; } int x264_reference_build_list_optimal( x264_t *h ) { ratecontrol_entry_t *rce = h->rc->rce; x264_frame_t *frames[16]; x264_weight_t weights[16][3]; int refcount[16]; if( rce->refs != h->i_ref[0] ) return -1; memcpy( frames, h->fref[0], sizeof(frames) ); memcpy( refcount, rce->refcount, sizeof(refcount) ); memcpy( weights, h->fenc->weight, sizeof(weights) ); memset( &h->fenc->weight[1][0], 0, sizeof(x264_weight_t[15][3]) ); /* For now don't reorder ref 0; it seems to lower quality in most cases due to skips. */ for( int ref = 1; ref < h->i_ref[0]; ref++ ) { int max = -1; int bestref = 1; for( int i = 1; i < h->i_ref[0]; i++ ) /* Favor lower POC as a tiebreaker. */ COPY2_IF_GT( max, refcount[i], bestref, i ); /* FIXME: If there are duplicates from frames other than ref0 then it is possible * that the optimal ordering doesn't place every duplicate. */ refcount[bestref] = -1; h->fref[0][ref] = frames[bestref]; memcpy( h->fenc->weight[ref], weights[bestref], sizeof(weights[bestref]) ); } return 0; } static char *strcat_filename( char *input, char *suffix ) { char *output = x264_malloc( strlen( input ) + strlen( suffix ) + 1 ); if( !output ) return NULL; strcpy( output, input ); strcat( output, suffix ); return output; } void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init ) { x264_ratecontrol_t *rc = h->rc; if( !b_init && rc->b_2pass ) return; if( h->param.rc.i_rc_method == X264_RC_CRF ) { /* Arbitrary rescaling to make CRF somewhat similar to QP. * Try to compensate for MB-tree's effects as well. */ double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80); double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0; rc->rate_factor_constant = pow( base_cplx, 1 - rc->qcompress ) / qp2qscale( h->param.rc.f_rf_constant + mbtree_offset + QP_BD_OFFSET ); } if( h->param.rc.i_vbv_max_bitrate > 0 && h->param.rc.i_vbv_buffer_size > 0 ) { /* We don't support changing the ABR bitrate right now, so if the stream starts as CBR, keep it CBR. */ if( rc->b_vbv_min_rate ) h->param.rc.i_vbv_max_bitrate = h->param.rc.i_bitrate; if( h->param.rc.i_vbv_buffer_size < (int)(h->param.rc.i_vbv_max_bitrate / rc->fps) ) { h->param.rc.i_vbv_buffer_size = h->param.rc.i_vbv_max_bitrate / rc->fps; x264_log( h, X264_LOG_WARNING, "VBV buffer size cannot be smaller than one frame, using %d kbit\n", h->param.rc.i_vbv_buffer_size ); } int kilobit_size = h->param.i_avcintra_class ? 
1024 : 1000; int vbv_buffer_size = h->param.rc.i_vbv_buffer_size * kilobit_size; int vbv_max_bitrate = h->param.rc.i_vbv_max_bitrate * kilobit_size; /* Init HRD */ if( h->param.i_nal_hrd && b_init ) { h->sps->vui.hrd.i_cpb_cnt = 1; h->sps->vui.hrd.b_cbr_hrd = h->param.i_nal_hrd == X264_NAL_HRD_CBR; h->sps->vui.hrd.i_time_offset_length = 0; #define BR_SHIFT 6 #define CPB_SHIFT 4 // normalize HRD size and rate to the value / scale notation h->sps->vui.hrd.i_bit_rate_scale = x264_clip3( x264_ctz( vbv_max_bitrate ) - BR_SHIFT, 0, 15 ); h->sps->vui.hrd.i_bit_rate_value = vbv_max_bitrate >> ( h->sps->vui.hrd.i_bit_rate_scale + BR_SHIFT ); h->sps->vui.hrd.i_bit_rate_unscaled = h->sps->vui.hrd.i_bit_rate_value << ( h->sps->vui.hrd.i_bit_rate_scale + BR_SHIFT ); h->sps->vui.hrd.i_cpb_size_scale = x264_clip3( x264_ctz( vbv_buffer_size ) - CPB_SHIFT, 0, 15 ); h->sps->vui.hrd.i_cpb_size_value = vbv_buffer_size >> ( h->sps->vui.hrd.i_cpb_size_scale + CPB_SHIFT ); h->sps->vui.hrd.i_cpb_size_unscaled = h->sps->vui.hrd.i_cpb_size_value << ( h->sps->vui.hrd.i_cpb_size_scale + CPB_SHIFT ); #undef CPB_SHIFT #undef BR_SHIFT // arbitrary #define MAX_DURATION 0.5 int max_cpb_output_delay = X264_MIN( h->param.i_keyint_max * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick, INT_MAX ); int max_dpb_output_delay = h->sps->vui.i_max_dec_frame_buffering * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick; int max_delay = (int)(90000.0 * (double)h->sps->vui.hrd.i_cpb_size_unscaled / h->sps->vui.hrd.i_bit_rate_unscaled + 0.5); h->sps->vui.hrd.i_initial_cpb_removal_delay_length = 2 + x264_clip3( 32 - x264_clz( max_delay ), 4, 22 ); h->sps->vui.hrd.i_cpb_removal_delay_length = x264_clip3( 32 - x264_clz( max_cpb_output_delay ), 4, 31 ); h->sps->vui.hrd.i_dpb_output_delay_length = x264_clip3( 32 - x264_clz( max_dpb_output_delay ), 4, 31 ); #undef MAX_DURATION vbv_buffer_size = h->sps->vui.hrd.i_cpb_size_unscaled; vbv_max_bitrate = h->sps->vui.hrd.i_bit_rate_unscaled; } else if( h->param.i_nal_hrd && !b_init ) { x264_log( h, X264_LOG_WARNING, "VBV parameters cannot be changed when NAL HRD is in use\n" ); return; } h->sps->vui.hrd.i_bit_rate_unscaled = vbv_max_bitrate; h->sps->vui.hrd.i_cpb_size_unscaled = vbv_buffer_size; if( rc->b_vbv_min_rate ) rc->bitrate = (double)h->param.rc.i_bitrate * kilobit_size; rc->buffer_rate = vbv_max_bitrate / rc->fps; rc->vbv_max_rate = vbv_max_bitrate; rc->buffer_size = vbv_buffer_size; rc->single_frame_vbv = rc->buffer_rate * 1.1 > rc->buffer_size; if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR ) rc->cbr_decay = 1.0 - rc->buffer_rate / rc->buffer_size * 0.5 * X264_MAX(0, 1.5 - rc->buffer_rate * rc->fps / rc->bitrate); if( h->param.rc.i_rc_method == X264_RC_CRF && h->param.rc.f_rf_constant_max ) { rc->rate_factor_max_increment = h->param.rc.f_rf_constant_max - h->param.rc.f_rf_constant; if( rc->rate_factor_max_increment <= 0 ) { x264_log( h, X264_LOG_WARNING, "CRF max must be greater than CRF\n" ); rc->rate_factor_max_increment = 0; } } if( b_init ) { if( h->param.rc.f_vbv_buffer_init > 1. 
) h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 ); h->param.rc.f_vbv_buffer_init = x264_clip3f( X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ), 0, 1); rc->buffer_fill_final = rc->buffer_fill_final_min = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; rc->b_vbv = 1; rc->b_vbv_min_rate = !rc->b_2pass && h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.i_vbv_max_bitrate <= h->param.rc.i_bitrate; } } } int x264_ratecontrol_new( x264_t *h ) { x264_ratecontrol_t *rc; x264_emms(); CHECKED_MALLOCZERO( h->rc, h->param.i_threads * sizeof(x264_ratecontrol_t) ); rc = h->rc; rc->b_abr = h->param.rc.i_rc_method != X264_RC_CQP && !h->param.rc.b_stat_read; rc->b_2pass = h->param.rc.i_rc_method == X264_RC_ABR && h->param.rc.b_stat_read; /* FIXME: use integers */ if( h->param.i_fps_num > 0 && h->param.i_fps_den > 0 ) rc->fps = (float) h->param.i_fps_num / h->param.i_fps_den; else rc->fps = 25.0; if( h->param.rc.b_mb_tree ) { h->param.rc.f_pb_factor = 1; rc->qcompress = 1; } else rc->qcompress = h->param.rc.f_qcompress; rc->bitrate = h->param.rc.i_bitrate * (h->param.i_avcintra_class ? 1024. : 1000.); rc->rate_tolerance = h->param.rc.f_rate_tolerance; rc->nmb = h->mb.i_mb_count; rc->last_non_b_pict_type = -1; rc->cbr_decay = 1.0; if( h->param.rc.i_rc_method != X264_RC_ABR && h->param.rc.b_stat_read ) { x264_log( h, X264_LOG_ERROR, "CRF/CQP is incompatible with 2pass.\n" ); return -1; } x264_ratecontrol_init_reconfigurable( h, 1 ); if( h->param.i_nal_hrd ) { uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale; uint64_t num = 90000; x264_reduce_fraction64( &num, &denom ); rc->hrd_multiply_denom = 90000 / num; double bits_required = log2( num ) + log2( h->sps->vui.i_time_scale ) + log2( h->sps->vui.hrd.i_cpb_size_unscaled ); if( bits_required >= 63 ) { x264_log( h, X264_LOG_ERROR, "HRD with very large timescale and bufsize not supported\n" ); return -1; } } if( rc->rate_tolerance < 0.01 ) { x264_log( h, X264_LOG_WARNING, "bitrate tolerance too small, using .01\n" ); rc->rate_tolerance = 0.01; } h->mb.b_variable_qp = rc->b_vbv || h->param.rc.i_aq_mode; if( rc->b_abr ) { /* FIXME ABR_INIT_QP is actually used only in CRF */ #define ABR_INIT_QP (( h->param.rc.i_rc_method == X264_RC_CRF ? 
h->param.rc.f_rf_constant : 24 ) + QP_BD_OFFSET) rc->accum_p_norm = .01; rc->accum_p_qp = ABR_INIT_QP * rc->accum_p_norm; /* estimated ratio that produces a reasonable QP for the first I-frame */ rc->cplxr_sum = .01 * pow( 7.0e5, rc->qcompress ) * pow( h->mb.i_mb_count, 0.5 ); rc->wanted_bits_window = 1.0 * rc->bitrate / rc->fps; rc->last_non_b_pict_type = SLICE_TYPE_I; } rc->ip_offset = 6.0 * log2f( h->param.rc.f_ip_factor ); rc->pb_offset = 6.0 * log2f( h->param.rc.f_pb_factor ); rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant; rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, QP_MAX ); rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, QP_MAX ); h->mb.ip_offset = rc->ip_offset + 0.5; rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 ); rc->last_qscale = qp2qscale( 26 + QP_BD_OFFSET ); int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1; CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds ); CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) ); static const float pred_coeff_table[3] = { 1.0, 1.0, 1.5 }; for( int i = 0; i < 3; i++ ) { rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP ); rc->lmin[i] = qp2qscale( h->param.rc.i_qp_min ); rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max ); for( int j = 0; j < num_preds; j++ ) { rc->pred[i+j*5].coeff_min = pred_coeff_table[i] / 2; rc->pred[i+j*5].coeff = pred_coeff_table[i]; rc->pred[i+j*5].count = 1.0; rc->pred[i+j*5].decay = 0.5; rc->pred[i+j*5].offset = 0.0; } for( int j = 0; j < 2; j++ ) { rc->row_preds[i][j].coeff_min = .25 / 4; rc->row_preds[i][j].coeff = .25; rc->row_preds[i][j].count = 1.0; rc->row_preds[i][j].decay = 0.5; rc->row_preds[i][j].offset = 0.0; } } rc->pred_b_from_p->coeff_min = 0.5 / 2; rc->pred_b_from_p->coeff = 0.5; rc->pred_b_from_p->count = 1.0; rc->pred_b_from_p->decay = 0.5; rc->pred_b_from_p->offset = 0.0; if( parse_zones( h ) < 0 ) { x264_log( h, X264_LOG_ERROR, "failed to parse zones\n" ); return -1; } /* Load stat file and init 2pass algo */ if( h->param.rc.b_stat_read ) { char *p, *stats_in, *stats_buf; /* read 1st pass stats */ assert( h->param.rc.psz_stat_in ); stats_buf = stats_in = x264_slurp_file( h->param.rc.psz_stat_in ); if( !stats_buf ) { x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open stats file\n" ); return -1; } if( h->param.rc.b_mb_tree ) { char *mbtree_stats_in = strcat_filename( h->param.rc.psz_stat_in, ".mbtree" ); if( !mbtree_stats_in ) return -1; rc->p_mbtree_stat_file_in = x264_fopen( mbtree_stats_in, "rb" ); x264_free( mbtree_stats_in ); if( !rc->p_mbtree_stat_file_in ) { x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n" ); return -1; } } /* check whether 1st pass options were compatible with current options */ if( strncmp( stats_buf, "#options:", 9 ) ) { x264_log( h, X264_LOG_ERROR, "options list in stats file not valid\n" ); return -1; } float res_factor, res_factor_bits; { int i, j; uint32_t k, l; char *opts = stats_buf; stats_in = strchr( stats_buf, '\n' ); if( !stats_in ) return -1; *stats_in = '\0'; stats_in++; if( sscanf( opts, "#options: %dx%d", &i, &j ) != 2 ) { x264_log( h, X264_LOG_ERROR, "resolution specified in stats file not valid\n" ); return -1; } else if( h->param.rc.b_mb_tree ) { rc->mbtree.srcdim[0] = i; rc->mbtree.srcdim[1] = j; } res_factor = (float)h->param.i_width * h->param.i_height / (i*j); /* Change in bits relative to resolution isn't quite linear on typical sources, * so we'll at least try to roughly 
approximate this effect. */ res_factor_bits = powf( res_factor, 0.7 ); if( !( p = strstr( opts, "timebase=" ) ) || sscanf( p, "timebase=%u/%u", &k, &l ) != 2 ) { x264_log( h, X264_LOG_ERROR, "timebase specified in stats file not valid\n" ); return -1; } if( k != h->param.i_timebase_num || l != h->param.i_timebase_den ) { x264_log( h, X264_LOG_ERROR, "timebase mismatch with 1st pass (%u/%u vs %u/%u)\n", h->param.i_timebase_num, h->param.i_timebase_den, k, l ); return -1; } CMP_OPT_FIRST_PASS( "bitdepth", BIT_DEPTH ); CMP_OPT_FIRST_PASS( "weightp", X264_MAX( 0, h->param.analyse.i_weighted_pred ) ); CMP_OPT_FIRST_PASS( "bframes", h->param.i_bframe ); CMP_OPT_FIRST_PASS( "b_pyramid", h->param.i_bframe_pyramid ); CMP_OPT_FIRST_PASS( "intra_refresh", h->param.b_intra_refresh ); CMP_OPT_FIRST_PASS( "open_gop", h->param.b_open_gop ); CMP_OPT_FIRST_PASS( "bluray_compat", h->param.b_bluray_compat ); CMP_OPT_FIRST_PASS( "mbtree", h->param.rc.b_mb_tree ); if( (p = strstr( opts, "interlaced=" )) ) { char *current = h->param.b_interlaced ? h->param.b_tff ? "tff" : "bff" : h->param.b_fake_interlaced ? "fake" : "0"; char buf[5]; sscanf( p, "interlaced=%4s", buf ); if( strcmp( current, buf ) ) { x264_log( h, X264_LOG_ERROR, "different interlaced setting than first pass (%s vs %s)\n", current, buf ); return -1; } } if( (p = strstr( opts, "keyint=" )) ) { p += 7; char buf[13] = "infinite "; if( h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE ) sprintf( buf, "%d ", h->param.i_keyint_max ); if( strncmp( p, buf, strlen(buf) ) ) { x264_log( h, X264_LOG_ERROR, "different keyint setting than first pass (%.*s vs %.*s)\n", strlen(buf)-1, buf, strcspn(p, " "), p ); return -1; } } if( strstr( opts, "qp=0" ) && h->param.rc.i_rc_method == X264_RC_ABR ) x264_log( h, X264_LOG_WARNING, "1st pass was lossless, bitrate prediction will be inaccurate\n" ); if( !strstr( opts, "direct=3" ) && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ) { x264_log( h, X264_LOG_WARNING, "direct=auto not used on the first pass\n" ); h->mb.b_direct_auto_write = 1; } if( ( p = strstr( opts, "b_adapt=" ) ) && sscanf( p, "b_adapt=%d", &i ) && i >= X264_B_ADAPT_NONE && i <= X264_B_ADAPT_TRELLIS ) h->param.i_bframe_adaptive = i; else if( h->param.i_bframe ) { x264_log( h, X264_LOG_ERROR, "b_adapt method specified in stats file not valid\n" ); return -1; } if( (h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size) && ( p = strstr( opts, "rc_lookahead=" ) ) && sscanf( p, "rc_lookahead=%d", &i ) ) h->param.rc.i_lookahead = i; } /* find number of pics */ p = stats_in; int num_entries; for( num_entries = -1; p; num_entries++ ) p = strchr( p + 1, ';' ); if( !num_entries ) { x264_log( h, X264_LOG_ERROR, "empty stats file\n" ); return -1; } rc->num_entries = num_entries; if( h->param.i_frame_total < rc->num_entries && h->param.i_frame_total > 0 ) { x264_log( h, X264_LOG_WARNING, "2nd pass has fewer frames than 1st pass (%d vs %d)\n", h->param.i_frame_total, rc->num_entries ); } if( h->param.i_frame_total > rc->num_entries ) { x264_log( h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d vs %d)\n", h->param.i_frame_total, rc->num_entries ); return -1; } CHECKED_MALLOCZERO( rc->entry, rc->num_entries * sizeof(ratecontrol_entry_t) ); CHECKED_MALLOC( rc->entry_out, rc->num_entries * sizeof(ratecontrol_entry_t*) ); /* init all to skipped p frames */ for( int i = 0; i < rc->num_entries; i++ ) { ratecontrol_entry_t *rce = &rc->entry[i]; rce->pict_type = SLICE_TYPE_P; rce->qscale = rce->new_qscale = qp2qscale( 20 + QP_BD_OFFSET ); 
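            /* Together with the zeroed allocation above, these defaults amount to a
             * cheap, mostly-skipped P-frame (roughly one header bit per macroblock and
             * no texture or motion bits) until the real stats are parsed in below. */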
rce->misc_bits = rc->nmb + 10; rce->new_qp = 0; rc->entry_out[i] = rce; } /* read stats */ p = stats_in; double total_qp_aq = 0; for( int i = 0; i < rc->num_entries; i++ ) { ratecontrol_entry_t *rce; int frame_number = 0; int frame_out_number = 0; char pict_type = 0; int e; char *next; float qp_rc, qp_aq; int ref; next= strchr(p, ';'); if( next ) *next++ = 0; //sscanf is unbelievably slow on long strings e = sscanf( p, " in:%d out:%d ", &frame_number, &frame_out_number ); if( frame_number < 0 || frame_number >= rc->num_entries ) { x264_log( h, X264_LOG_ERROR, "bad frame number (%d) at stats line %d\n", frame_number, i ); return -1; } if( frame_out_number < 0 || frame_out_number >= rc->num_entries ) { x264_log( h, X264_LOG_ERROR, "bad frame output number (%d) at stats line %d\n", frame_out_number, i ); return -1; } rce = &rc->entry[frame_number]; rc->entry_out[frame_out_number] = rce; rce->direct_mode = 0; e += sscanf( p, " in:%*d out:%*d type:%c dur:%"SCNd64" cpbdur:%"SCNd64" q:%f aq:%f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c", &pict_type, &rce->i_duration, &rce->i_cpb_duration, &qp_rc, &qp_aq, &rce->tex_bits, &rce->mv_bits, &rce->misc_bits, &rce->i_count, &rce->p_count, &rce->s_count, &rce->direct_mode ); rce->tex_bits *= res_factor_bits; rce->mv_bits *= res_factor_bits; rce->misc_bits *= res_factor_bits; rce->i_count *= res_factor; rce->p_count *= res_factor; rce->s_count *= res_factor; p = strstr( p, "ref:" ); if( !p ) goto parse_error; p += 4; for( ref = 0; ref < 16; ref++ ) { if( sscanf( p, " %d", &rce->refcount[ref] ) != 1 ) break; p = strchr( p+1, ' ' ); if( !p ) goto parse_error; } rce->refs = ref; /* find weights */ rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1; char *w = strchr( p, 'w' ); if( w ) { int count = sscanf( w, "w:%hd,%hd,%hd,%hd,%hd,%hd,%hd,%hd", &rce->i_weight_denom[0], &rce->weight[0][0], &rce->weight[0][1], &rce->i_weight_denom[1], &rce->weight[1][0], &rce->weight[1][1], &rce->weight[2][0], &rce->weight[2][1] ); if( count == 3 ) rce->i_weight_denom[1] = -1; else if( count != 8 ) rce->i_weight_denom[0] = rce->i_weight_denom[1] = -1; } if( pict_type != 'b' ) rce->kept_as_ref = 1; switch( pict_type ) { case 'I': rce->frame_type = X264_TYPE_IDR; rce->pict_type = SLICE_TYPE_I; break; case 'i': rce->frame_type = X264_TYPE_I; rce->pict_type = SLICE_TYPE_I; break; case 'P': rce->frame_type = X264_TYPE_P; rce->pict_type = SLICE_TYPE_P; break; case 'B': rce->frame_type = X264_TYPE_BREF; rce->pict_type = SLICE_TYPE_B; break; case 'b': rce->frame_type = X264_TYPE_B; rce->pict_type = SLICE_TYPE_B; break; default: e = -1; break; } if( e < 14 ) { parse_error: x264_log( h, X264_LOG_ERROR, "statistics are damaged at line %d, parser out=%d\n", i, e ); return -1; } rce->qscale = qp2qscale( qp_rc ); total_qp_aq += qp_aq; p = next; } if( !h->param.b_stitchable ) h->pps->i_pic_init_qp = SPEC_QP( (int)(total_qp_aq / rc->num_entries + 0.5) ); x264_free( stats_buf ); if( h->param.rc.i_rc_method == X264_RC_ABR ) { if( init_pass2( h ) < 0 ) return -1; } /* else we're using constant quant, so no need to run the bitrate allocation */ } /* Open output file */ /* If input and output files are the same, output to a temp file * and move it to the real name only when it's complete */ if( h->param.rc.b_stat_write ) { char *p; rc->psz_stat_file_tmpname = strcat_filename( h->param.rc.psz_stat_out, ".temp" ); if( !rc->psz_stat_file_tmpname ) return -1; rc->p_stat_file_out = x264_fopen( rc->psz_stat_file_tmpname, "wb" ); if( rc->p_stat_file_out == NULL ) { x264_log( h, X264_LOG_ERROR, 
"ratecontrol_init: can't open stats file\n" ); return -1; } p = x264_param2string( &h->param, 1 ); if( p ) fprintf( rc->p_stat_file_out, "#options: %s\n", p ); x264_free( p ); if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read ) { rc->psz_mbtree_stat_file_tmpname = strcat_filename( h->param.rc.psz_stat_out, ".mbtree.temp" ); rc->psz_mbtree_stat_file_name = strcat_filename( h->param.rc.psz_stat_out, ".mbtree" ); if( !rc->psz_mbtree_stat_file_tmpname || !rc->psz_mbtree_stat_file_name ) return -1; rc->p_mbtree_stat_file_out = x264_fopen( rc->psz_mbtree_stat_file_tmpname, "wb" ); if( rc->p_mbtree_stat_file_out == NULL ) { x264_log( h, X264_LOG_ERROR, "ratecontrol_init: can't open mbtree stats file\n" ); return -1; } } } if( h->param.rc.b_mb_tree && (h->param.rc.b_stat_read || h->param.rc.b_stat_write) ) { if( !h->param.rc.b_stat_read ) { rc->mbtree.srcdim[0] = h->param.i_width; rc->mbtree.srcdim[1] = h->param.i_height; } if( macroblock_tree_rescale_init( h, rc ) < 0 ) return -1; } for( int i = 0; iparam.i_threads; i++ ) { h->thread[i]->rc = rc+i; if( i ) { rc[i] = rc[0]; h->thread[i]->param = h->param; h->thread[i]->mb.b_variable_qp = h->mb.b_variable_qp; h->thread[i]->mb.ip_offset = h->mb.ip_offset; } } return 0; fail: return -1; } static int parse_zone( x264_t *h, x264_zone_t *z, char *p ) { int len = 0; char *tok, UNUSED *saveptr=NULL; z->param = NULL; z->f_bitrate_factor = 1; if( 3 <= sscanf(p, "%d,%d,q=%d%n", &z->i_start, &z->i_end, &z->i_qp, &len) ) z->b_force_qp = 1; else if( 3 <= sscanf(p, "%d,%d,b=%f%n", &z->i_start, &z->i_end, &z->f_bitrate_factor, &len) ) z->b_force_qp = 0; else if( 2 <= sscanf(p, "%d,%d%n", &z->i_start, &z->i_end, &len) ) z->b_force_qp = 0; else { x264_log( h, X264_LOG_ERROR, "invalid zone: \"%s\"\n", p ); return -1; } p += len; if( !*p ) return 0; CHECKED_MALLOC( z->param, sizeof(x264_param_t) ); memcpy( z->param, &h->param, sizeof(x264_param_t) ); z->param->opaque = NULL; z->param->param_free = x264_free; while( (tok = strtok_r( p, ",", &saveptr )) ) { char *val = strchr( tok, '=' ); if( val ) { *val = '\0'; val++; } if( x264_param_parse( z->param, tok, val ) ) { x264_log( h, X264_LOG_ERROR, "invalid zone param: %s = %s\n", tok, val ); return -1; } p = NULL; } return 0; fail: return -1; } static int parse_zones( x264_t *h ) { x264_ratecontrol_t *rc = h->rc; if( h->param.rc.psz_zones && !h->param.rc.i_zones ) { char *psz_zones, *p; CHECKED_MALLOC( psz_zones, strlen( h->param.rc.psz_zones )+1 ); strcpy( psz_zones, h->param.rc.psz_zones ); h->param.rc.i_zones = 1; for( p = psz_zones; *p; p++ ) h->param.rc.i_zones += (*p == '/'); CHECKED_MALLOC( h->param.rc.zones, h->param.rc.i_zones * sizeof(x264_zone_t) ); p = psz_zones; for( int i = 0; i < h->param.rc.i_zones; i++ ) { int i_tok = strcspn( p, "/" ); p[i_tok] = 0; if( parse_zone( h, &h->param.rc.zones[i], p ) ) { x264_free( psz_zones ); return -1; } p += i_tok + 1; } x264_free( psz_zones ); } if( h->param.rc.i_zones > 0 ) { for( int i = 0; i < h->param.rc.i_zones; i++ ) { x264_zone_t z = h->param.rc.zones[i]; if( z.i_start < 0 || z.i_start > z.i_end ) { x264_log( h, X264_LOG_ERROR, "invalid zone: start=%d end=%d\n", z.i_start, z.i_end ); return -1; } else if( !z.b_force_qp && z.f_bitrate_factor <= 0 ) { x264_log( h, X264_LOG_ERROR, "invalid zone: bitrate_factor=%f\n", z.f_bitrate_factor ); return -1; } } rc->i_zones = h->param.rc.i_zones + 1; CHECKED_MALLOC( rc->zones, rc->i_zones * sizeof(x264_zone_t) ); memcpy( rc->zones+1, h->param.rc.zones, (rc->i_zones-1) * sizeof(x264_zone_t) ); // default zone to fall back 
to if none of the others match rc->zones[0].i_start = 0; rc->zones[0].i_end = INT_MAX; rc->zones[0].b_force_qp = 0; rc->zones[0].f_bitrate_factor = 1; CHECKED_MALLOC( rc->zones[0].param, sizeof(x264_param_t) ); memcpy( rc->zones[0].param, &h->param, sizeof(x264_param_t) ); rc->zones[0].param->opaque = NULL; for( int i = 1; i < rc->i_zones; i++ ) { if( !rc->zones[i].param ) rc->zones[i].param = rc->zones[0].param; } } return 0; fail: return -1; } static x264_zone_t *get_zone( x264_t *h, int frame_num ) { x264_ratecontrol_t *rc = h->rc; for( int i = rc->i_zones - 1; i >= 0; i-- ) { x264_zone_t *z = &rc->zones[i]; if( frame_num >= z->i_start && frame_num <= z->i_end ) return z; } return NULL; } void x264_ratecontrol_summary( x264_t *h ) { x264_ratecontrol_t *rc = h->rc; if( rc->b_abr && h->param.rc.i_rc_method == X264_RC_ABR && rc->cbr_decay > .9999 ) { double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80); double mbtree_offset = h->param.rc.b_mb_tree ? (1.0-h->param.rc.f_qcompress)*13.5 : 0; x264_log( h, X264_LOG_INFO, "final ratefactor: %.2f\n", qscale2qp( pow( base_cplx, 1 - rc->qcompress ) * rc->cplxr_sum / rc->wanted_bits_window ) - mbtree_offset - QP_BD_OFFSET ); } } void x264_ratecontrol_delete( x264_t *h ) { x264_ratecontrol_t *rc = h->rc; int b_regular_file; if( rc->p_stat_file_out ) { b_regular_file = x264_is_regular_file( rc->p_stat_file_out ); fclose( rc->p_stat_file_out ); if( h->i_frame >= rc->num_entries && b_regular_file ) if( x264_rename( rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out ) != 0 ) { x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n", rc->psz_stat_file_tmpname, h->param.rc.psz_stat_out ); } x264_free( rc->psz_stat_file_tmpname ); } if( rc->p_mbtree_stat_file_out ) { b_regular_file = x264_is_regular_file( rc->p_mbtree_stat_file_out ); fclose( rc->p_mbtree_stat_file_out ); if( h->i_frame >= rc->num_entries && b_regular_file ) if( x264_rename( rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ) != 0 ) { x264_log( h, X264_LOG_ERROR, "failed to rename \"%s\" to \"%s\"\n", rc->psz_mbtree_stat_file_tmpname, rc->psz_mbtree_stat_file_name ); } x264_free( rc->psz_mbtree_stat_file_tmpname ); x264_free( rc->psz_mbtree_stat_file_name ); } if( rc->p_mbtree_stat_file_in ) fclose( rc->p_mbtree_stat_file_in ); x264_free( rc->pred ); x264_free( rc->pred_b_from_p ); x264_free( rc->entry ); x264_free( rc->entry_out ); macroblock_tree_rescale_destroy( rc ); if( rc->zones ) { x264_param_cleanup( rc->zones[0].param ); x264_free( rc->zones[0].param ); for( int i = 1; i < rc->i_zones; i++ ) if( rc->zones[i].param != rc->zones[0].param && rc->zones[i].param->param_free ) { x264_param_cleanup( rc->zones[i].param ); rc->zones[i].param->param_free( rc->zones[i].param ); } x264_free( rc->zones ); } x264_free( rc ); } static void accum_p_qp_update( x264_t *h, float qp ) { x264_ratecontrol_t *rc = h->rc; rc->accum_p_qp *= .95; rc->accum_p_norm *= .95; rc->accum_p_norm += 1; if( h->sh.i_type == SLICE_TYPE_I ) rc->accum_p_qp += qp + rc->ip_offset; else rc->accum_p_qp += qp; } void x264_ratecontrol_zone_init( x264_t *h ) { x264_ratecontrol_t *rc = h->rc; x264_zone_t *zone = get_zone( h, h->fenc->i_frame ); if( zone && (!rc->prev_zone || zone->param != rc->prev_zone->param) ) x264_encoder_reconfig_apply( h, zone->param ); rc->prev_zone = zone; } /* Before encoding a frame, choose a QP for it */ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead ) { x264_ratecontrol_t *rc = h->rc; ratecontrol_entry_t *rce = NULL; x264_zone_t *zone = 
get_zone( h, h->fenc->i_frame ); float q; x264_emms(); if( h->param.rc.b_stat_read ) { int frame = h->fenc->i_frame; assert( frame >= 0 && frame < rc->num_entries ); rce = rc->rce = &rc->entry[frame]; if( h->sh.i_type == SLICE_TYPE_B && h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO ) { h->sh.b_direct_spatial_mv_pred = ( rce->direct_mode == 's' ); h->mb.b_direct_auto_read = ( rce->direct_mode == 's' || rce->direct_mode == 't' ); } } if( rc->b_vbv ) { memset( h->fdec->i_row_bits, 0, h->mb.i_mb_height * sizeof(int) ); memset( h->fdec->f_row_qp, 0, h->mb.i_mb_height * sizeof(float) ); memset( h->fdec->f_row_qscale, 0, h->mb.i_mb_height * sizeof(float) ); rc->row_pred = rc->row_preds[h->sh.i_type]; rc->buffer_rate = h->fenc->i_cpb_duration * rc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; update_vbv_plan( h, overhead ); const x264_level_t *l = x264_levels; while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc ) l++; int mincr = l->mincr; if( h->param.b_bluray_compat ) mincr = 4; /* Profiles above High don't require minCR, so just set the maximum to a large value. */ if( h->sps->i_profile_idc > PROFILE_HIGH ) rc->frame_size_maximum = 1e9; else { /* The spec has a bizarre special case for the first frame. */ if( h->i_frame == 0 ) { //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR double fr = 1. / (h->param.i_level_idc >= 60 ? 300 : 172); int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height; rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr; } else { //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR rc->frame_size_maximum = 384 * BIT_DEPTH * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr; } } } if( h->sh.i_type != SLICE_TYPE_B ) rc->bframes = h->fenc->i_bframes; if( rc->b_abr ) { q = qscale2qp( rate_estimate_qscale( h ) ); } else if( rc->b_2pass ) { rce->new_qscale = rate_estimate_qscale( h ); q = qscale2qp( rce->new_qscale ); } else /* CQP */ { if( h->sh.i_type == SLICE_TYPE_B && h->fdec->b_kept_as_ref ) q = ( rc->qp_constant[ SLICE_TYPE_B ] + rc->qp_constant[ SLICE_TYPE_P ] ) / 2; else q = rc->qp_constant[ h->sh.i_type ]; if( zone ) { if( zone->b_force_qp ) q += zone->i_qp - rc->qp_constant[SLICE_TYPE_P]; else q -= 6*log2f( zone->f_bitrate_factor ); } } if( i_force_qp != X264_QP_AUTO ) q = i_force_qp - 1; q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); rc->qpa_rc = rc->qpa_rc_prev = rc->qpa_aq = rc->qpa_aq_prev = 0; h->fdec->f_qp_avg_rc = h->fdec->f_qp_avg_aq = rc->qpm = q; if( rce ) rce->new_qp = q; accum_p_qp_update( h, rc->qpm ); if( h->sh.i_type != SLICE_TYPE_B ) rc->last_non_b_pict_type = h->sh.i_type; } static float predict_row_size( x264_t *h, int y, float qscale ) { /* average between two predictors: * absolute SATD, and scaled bit cost of the colocated row in the previous frame */ x264_ratecontrol_t *rc = h->rc; float pred_s = predict_size( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y] ); if( h->sh.i_type == SLICE_TYPE_I || qscale >= h->fref[0][0]->f_row_qscale[y] ) { if( h->sh.i_type == SLICE_TYPE_P && h->fref[0][0]->i_type == h->fdec->i_type && h->fref[0][0]->f_row_qscale[y] > 0 && h->fref[0][0]->i_row_satd[y] > 0 && (abs(h->fref[0][0]->i_row_satd[y] - h->fdec->i_row_satd[y]) < h->fdec->i_row_satd[y]/2)) { float pred_t = h->fref[0][0]->i_row_bits[y] * h->fdec->i_row_satd[y] / h->fref[0][0]->i_row_satd[y] * h->fref[0][0]->f_row_qscale[y] / qscale; return 
(pred_s + pred_t) * 0.5f; } return pred_s; } /* Our QP is lower than the reference! */ else { float pred_intra = predict_size( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y] ); /* Sum: better to overestimate than underestimate by using only one of the two predictors. */ return pred_intra + pred_s; } } static int row_bits_so_far( x264_t *h, int y ) { int bits = 0; for( int i = h->i_threadslice_start; i <= y; i++ ) bits += h->fdec->i_row_bits[i]; return bits; } static float predict_row_size_to_end( x264_t *h, int y, float qp ) { float qscale = qp2qscale( qp ); float bits = 0; for( int i = y+1; i < h->i_threadslice_end; i++ ) bits += predict_row_size( h, i, qscale ); return bits; } /* TODO: * eliminate all use of qp in row ratecontrol: make it entirely qscale-based. * make this function stop being needlessly O(N^2) * update more often than once per row? */ int x264_ratecontrol_mb( x264_t *h, int bits ) { x264_ratecontrol_t *rc = h->rc; const int y = h->mb.i_mb_y; h->fdec->i_row_bits[y] += bits; rc->qpa_aq += h->mb.i_qp; if( h->mb.i_mb_x != h->mb.i_mb_width - 1 ) return 0; x264_emms(); rc->qpa_rc += rc->qpm * h->mb.i_mb_width; if( !rc->b_vbv ) return 0; float qscale = qp2qscale( rc->qpm ); h->fdec->f_row_qp[y] = rc->qpm; h->fdec->f_row_qscale[y] = qscale; update_predictor( &rc->row_pred[0], qscale, h->fdec->i_row_satd[y], h->fdec->i_row_bits[y] ); if( h->sh.i_type != SLICE_TYPE_I && rc->qpm < h->fref[0][0]->f_row_qp[y] ) update_predictor( &rc->row_pred[1], qscale, h->fdec->i_row_satds[0][0][y], h->fdec->i_row_bits[y] ); /* update ratecontrol per-mbpair in MBAFF */ if( SLICE_MBAFF && !(y&1) ) return 0; /* FIXME: We don't currently support the case where there's a slice * boundary in between. */ int can_reencode_row = h->sh.i_first_mb <= ((h->mb.i_mb_y - SLICE_MBAFF) * h->mb.i_mb_stride); /* tweak quality based on difference from predicted size */ float prev_row_qp = h->fdec->f_row_qp[y]; float qp_absolute_max = h->param.rc.i_qp_max; if( rc->rate_factor_max_increment ) qp_absolute_max = X264_MIN( qp_absolute_max, rc->qp_novbv + rc->rate_factor_max_increment ); float qp_max = X264_MIN( prev_row_qp + h->param.rc.i_qp_step, qp_absolute_max ); float qp_min = X264_MAX( prev_row_qp - h->param.rc.i_qp_step, h->param.rc.i_qp_min ); float step_size = 0.5f; float slice_size_planned = h->param.b_sliced_threads ? 
rc->slice_size_planned : rc->frame_size_planned; float bits_so_far = row_bits_so_far( h, y ); rc->bits_so_far = bits_so_far; float max_frame_error = x264_clip3f( 1.0 / h->mb.i_mb_height, 0.05, 0.25 ); float max_frame_size = rc->frame_size_maximum - rc->frame_size_maximum * max_frame_error; max_frame_size = X264_MIN( max_frame_size, rc->buffer_fill - rc->buffer_rate * max_frame_error ); float size_of_other_slices = 0; if( h->param.b_sliced_threads ) { float bits_so_far_of_other_slices = 0; for( int i = 0; i < h->param.i_threads; i++ ) if( h != h->thread[i] ) { size_of_other_slices += h->thread[i]->rc->frame_size_estimated; bits_so_far_of_other_slices += h->thread[i]->rc->bits_so_far; } float weight = x264_clip3f( (bits_so_far_of_other_slices + rc->frame_size_estimated) / (size_of_other_slices + rc->frame_size_estimated), 0.0, 1.0 ); float frame_size_planned = rc->frame_size_planned - rc->frame_size_planned * max_frame_error; float size_of_other_slices_planned = X264_MIN( frame_size_planned, max_frame_size ) - rc->slice_size_planned; size_of_other_slices_planned = X264_MAX( size_of_other_slices_planned, bits_so_far_of_other_slices ); size_of_other_slices = (size_of_other_slices - size_of_other_slices_planned) * weight + size_of_other_slices_planned; } if( y < h->i_threadslice_end-1 ) { /* B-frames shouldn't use lower QP than their reference frames. */ if( h->sh.i_type == SLICE_TYPE_B ) { qp_min = X264_MAX( qp_min, X264_MAX( h->fref[0][0]->f_row_qp[y+1], h->fref[1][0]->f_row_qp[y+1] ) ); rc->qpm = X264_MAX( rc->qpm, qp_min ); } float buffer_left_planned = rc->buffer_fill - rc->frame_size_planned; buffer_left_planned = X264_MAX( buffer_left_planned, 0.f ); /* More threads means we have to be more cautious in letting ratecontrol use up extra bits. */ float rc_tol = buffer_left_planned / h->param.i_threads * rc->rate_tolerance; float b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices; float trust_coeff = x264_clip3f( bits_so_far / slice_size_planned, 0.0, 1.0 ); /* Don't increase the row QPs until a sufficient amount of the bits of the frame have been processed, in case a flat */ /* area at the top of the frame was measured inaccurately. 
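       (trust_coeff is bits_so_far / slice_size_planned clipped to [0,1]; below
       0.05 the row QP is simply not allowed to rise above prev_row_qp.)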
*/ if( trust_coeff < 0.05f ) qp_max = qp_absolute_max = prev_row_qp; if( h->sh.i_type != SLICE_TYPE_I ) rc_tol *= 0.5f; if( !rc->b_vbv_min_rate ) qp_min = X264_MAX( qp_min, rc->qp_novbv ); while( rc->qpm < qp_max && ((b1 > rc->frame_size_planned + rc_tol) || (b1 > rc->frame_size_planned && rc->qpm < rc->qp_novbv) || (b1 > rc->buffer_fill - buffer_left_planned * 0.5f)) ) { rc->qpm += step_size; b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices; } float b_max = b1 + ((rc->buffer_fill - rc->buffer_size + rc->buffer_rate) * 0.90f - b1) * trust_coeff; rc->qpm -= step_size; float b2 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices; while( rc->qpm > qp_min && rc->qpm < prev_row_qp && (rc->qpm > h->fdec->f_row_qp[0] || rc->single_frame_vbv) && (b2 < max_frame_size) && ((b2 < rc->frame_size_planned * 0.8f) || (b2 < b_max)) ) { b1 = b2; rc->qpm -= step_size; b2 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices; } rc->qpm += step_size; /* avoid VBV underflow or MinCR violation */ while( rc->qpm < qp_absolute_max && (b1 > max_frame_size) ) { rc->qpm += step_size; b1 = bits_so_far + predict_row_size_to_end( h, y, rc->qpm ) + size_of_other_slices; } rc->frame_size_estimated = b1 - size_of_other_slices; /* If the current row was large enough to cause a large QP jump, try re-encoding it. */ if( rc->qpm > qp_max && prev_row_qp < qp_max && can_reencode_row ) { /* Bump QP to halfway in between... close enough. */ rc->qpm = x264_clip3f( (prev_row_qp + rc->qpm)*0.5f, prev_row_qp + 1.0f, qp_max ); rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; h->fdec->i_row_bits[y] = 0; h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } else { rc->frame_size_estimated = bits_so_far; /* Last-ditch attempt: if the last row of the frame underflowed the VBV, * try again. */ if( rc->qpm < qp_max && can_reencode_row && (bits_so_far + size_of_other_slices > X264_MIN( rc->frame_size_maximum, rc->buffer_fill )) ) { rc->qpm = qp_max; rc->qpa_rc = rc->qpa_rc_prev; rc->qpa_aq = rc->qpa_aq_prev; h->fdec->i_row_bits[y] = 0; h->fdec->i_row_bits[y-SLICE_MBAFF] = 0; return -1; } } rc->qpa_rc_prev = rc->qpa_rc; rc->qpa_aq_prev = rc->qpa_aq; return 0; } int x264_ratecontrol_qp( x264_t *h ) { x264_emms(); return x264_clip3( h->rc->qpm + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); } int x264_ratecontrol_mb_qp( x264_t *h ) { x264_emms(); float qp = h->rc->qpm; if( h->param.rc.i_aq_mode ) { /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */ float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy]; /* Scale AQ's effect towards zero in emergency mode. */ if( qp > QP_MAX_SPEC ) qp_offset *= (QP_MAX - qp) / (QP_MAX - QP_MAX_SPEC); qp += qp_offset; } return x264_clip3( qp + 0.5f, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); } /* In 2pass, force the same frame types as in the 1st pass */ int x264_ratecontrol_slice_type( x264_t *h, int frame_num ) { x264_ratecontrol_t *rc = h->rc; if( h->param.rc.b_stat_read ) { if( frame_num >= rc->num_entries ) { /* We could try to initialize everything required for ABR and * adaptive B-frames, but that would be complicated. * So just calculate the average QP used so far. */ h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 
24 + QP_BD_OFFSET : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P]; rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX ); rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / h->param.rc.f_ip_factor ) + 0.5 ), 0, QP_MAX ); rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * h->param.rc.f_pb_factor ) + 0.5 ), 0, QP_MAX ); x264_log( h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries ); x264_log( h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant ); if( h->param.i_bframe_adaptive ) x264_log( h, X264_LOG_ERROR, "disabling adaptive B-frames\n" ); for( int i = 0; i < h->param.i_threads; i++ ) { h->thread[i]->rc->b_abr = 0; h->thread[i]->rc->b_2pass = 0; h->thread[i]->param.rc.i_rc_method = X264_RC_CQP; h->thread[i]->param.rc.b_stat_read = 0; h->thread[i]->param.i_bframe_adaptive = 0; h->thread[i]->param.i_scenecut_threshold = 0; h->thread[i]->param.rc.b_mb_tree = 0; if( h->thread[i]->param.i_bframe > 1 ) h->thread[i]->param.i_bframe = 1; } return X264_TYPE_AUTO; } return rc->entry[frame_num].frame_type; } else return X264_TYPE_AUTO; } void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm ) { ratecontrol_entry_t *rce = &h->rc->entry[frm->i_frame]; if( h->param.analyse.i_weighted_pred <= 0 ) return; if( rce->i_weight_denom[0] >= 0 ) SET_WEIGHT( frm->weight[0][0], 1, rce->weight[0][0], rce->i_weight_denom[0], rce->weight[0][1] ); if( rce->i_weight_denom[1] >= 0 ) { SET_WEIGHT( frm->weight[0][1], 1, rce->weight[1][0], rce->i_weight_denom[1], rce->weight[1][1] ); SET_WEIGHT( frm->weight[0][2], 1, rce->weight[2][0], rce->i_weight_denom[1], rce->weight[2][1] ); } } /* After encoding one frame, save stats and update ratecontrol state */ int x264_ratecontrol_end( x264_t *h, int bits, int *filler ) { x264_ratecontrol_t *rc = h->rc; const int *mbs = h->stat.frame.i_mb_count; x264_emms(); h->stat.frame.i_mb_count_skip = mbs[P_SKIP] + mbs[B_SKIP]; h->stat.frame.i_mb_count_i = mbs[I_16x16] + mbs[I_8x8] + mbs[I_4x4] + mbs[I_PCM]; h->stat.frame.i_mb_count_p = mbs[P_L0] + mbs[P_8x8]; for( int i = B_DIRECT; i <= B_8x8; i++ ) h->stat.frame.i_mb_count_p += mbs[i]; h->fdec->f_qp_avg_rc = rc->qpa_rc /= h->mb.i_mb_count; h->fdec->f_qp_avg_aq = (float)rc->qpa_aq / h->mb.i_mb_count; h->fdec->f_crf_avg = h->param.rc.f_rf_constant + h->fdec->f_qp_avg_rc - rc->qp_novbv; if( h->param.rc.b_stat_write ) { char c_type = h->sh.i_type==SLICE_TYPE_I ? (h->fenc->i_poc==0 ? 'I' : 'i') : h->sh.i_type==SLICE_TYPE_P ? 'P' : h->fenc->b_kept_as_ref ? 'B' : 'b'; int dir_frame = h->stat.frame.i_direct_score[1] - h->stat.frame.i_direct_score[0]; int dir_avg = h->stat.i_direct_score[1] - h->stat.i_direct_score[0]; char c_direct = h->mb.b_direct_auto_write ? ( dir_frame>0 ? 's' : dir_frame<0 ? 't' : dir_avg>0 ? 's' : dir_avg<0 ? 't' : '-' ) : '-'; if( fprintf( rc->p_stat_file_out, "in:%d out:%d type:%c dur:%"PRId64" cpbdur:%"PRId64" q:%.2f aq:%.2f tex:%d mv:%d misc:%d imb:%d pmb:%d smb:%d d:%c ref:", h->fenc->i_frame, h->i_frame, c_type, h->fenc->i_duration, h->fenc->i_cpb_duration, rc->qpa_rc, h->fdec->f_qp_avg_aq, h->stat.frame.i_tex_bits, h->stat.frame.i_mv_bits, h->stat.frame.i_misc_bits, h->stat.frame.i_mb_count_i, h->stat.frame.i_mb_count_p, h->stat.frame.i_mb_count_skip, c_direct) < 0 ) goto fail; /* Only write information for reference reordering once. 
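         * When stats are being read (2nd and later passes) and the first pass used
         * more than one reference, the refcounts recorded by the first pass are
         * written back unchanged instead of this pass's counts, so every pass
         * reorders the reference list from the same data.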
*/ int use_old_stats = h->param.rc.b_stat_read && rc->rce->refs > 1; for( int i = 0; i < (use_old_stats ? rc->rce->refs : h->i_ref[0]); i++ ) { int refcount = use_old_stats ? rc->rce->refcount[i] : PARAM_INTERLACED ? h->stat.frame.i_mb_count_ref[0][i*2] + h->stat.frame.i_mb_count_ref[0][i*2+1] : h->stat.frame.i_mb_count_ref[0][i]; if( fprintf( rc->p_stat_file_out, "%d ", refcount ) < 0 ) goto fail; } if( h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE && h->sh.weight[0][0].weightfn ) { if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d", h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 ) goto fail; if( h->sh.weight[0][1].weightfn || h->sh.weight[0][2].weightfn ) { if( fprintf( rc->p_stat_file_out, ",%d,%d,%d,%d,%d ", h->sh.weight[0][1].i_denom, h->sh.weight[0][1].i_scale, h->sh.weight[0][1].i_offset, h->sh.weight[0][2].i_scale, h->sh.weight[0][2].i_offset ) < 0 ) goto fail; } else if( fprintf( rc->p_stat_file_out, " " ) < 0 ) goto fail; } if( fprintf( rc->p_stat_file_out, ";\n") < 0 ) goto fail; /* Don't re-write the data in multi-pass mode. */ if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read ) { uint8_t i_type = h->sh.i_type; h->mc.mbtree_fix8_pack( rc->mbtree.qp_buffer[0], h->fenc->f_qp_offset, h->mb.i_mb_count ); if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 ) goto fail; if( fwrite( rc->mbtree.qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < (unsigned)h->mb.i_mb_count ) goto fail; } } if( rc->b_abr ) { if( h->sh.i_type != SLICE_TYPE_B ) rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / rc->last_rceq; else { /* Depends on the fact that B-frame's QP is an offset from the following P-frame's. * Not perfectly accurate with B-refs, but good enough. 
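             * Dividing by f_pb_factor maps the B-frame's qscale back onto the scale
             * of the P-frame it is offset from, so cplxr_sum accumulates comparable
             * complexity numbers regardless of frame type.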
*/ rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * h->param.rc.f_pb_factor); } rc->cplxr_sum *= rc->cbr_decay; rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate; rc->wanted_bits_window *= rc->cbr_decay; } if( rc->b_2pass ) rc->expected_bits_sum += qscale2bits( rc->rce, qp2qscale( rc->rce->new_qp ) ); if( h->mb.b_variable_qp ) { if( h->sh.i_type == SLICE_TYPE_B ) { rc->bframe_bits += bits; if( h->fenc->b_last_minigop_bframe ) { update_predictor( rc->pred_b_from_p, qp2qscale( rc->qpa_rc ), h->fref[1][h->i_ref[1]-1]->i_satd, rc->bframe_bits / rc->bframes ); rc->bframe_bits = 0; } } } *filler = update_vbv( h, bits ); rc->filler_bits_sum += *filler * 8; if( h->sps->vui.b_nal_hrd_parameters_present ) { if( h->fenc->i_frame == 0 ) { // access unit initialises the HRD h->fenc->hrd_timing.cpb_initial_arrival_time = 0; rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit = (double)rc->initial_cpb_removal_delay / 90000; } else { h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit + (double)(h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset) * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; if( h->fenc->b_keyframe ) { rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; } double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; if( !h->fenc->b_keyframe ) cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; if( h->sps->vui.hrd.b_cbr_hrd ) h->fenc->hrd_timing.cpb_initial_arrival_time = rc->previous_cpb_final_arrival_time; else h->fenc->hrd_timing.cpb_initial_arrival_time = X264_MAX( rc->previous_cpb_final_arrival_time, cpb_earliest_arrival_time ); } int filler_bits = *filler ? 
X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), *filler )*8 : 0; // Equation C-6 h->fenc->hrd_timing.cpb_final_arrival_time = rc->previous_cpb_final_arrival_time = h->fenc->hrd_timing.cpb_initial_arrival_time + (double)(bits + filler_bits) / h->sps->vui.hrd.i_bit_rate_unscaled; h->fenc->hrd_timing.dpb_output_time = (double)h->fenc->i_dpb_output_delay * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale + h->fenc->hrd_timing.cpb_removal_time; } return 0; fail: x264_log( h, X264_LOG_ERROR, "ratecontrol_end: stats file could not be written to\n" ); return -1; } /**************************************************************************** * 2 pass functions ***************************************************************************/ /** * modify the bitrate curve from pass1 for one frame */ static double get_qscale(x264_t *h, ratecontrol_entry_t *rce, double rate_factor, int frame_num) { x264_ratecontrol_t *rcc= h->rc; x264_zone_t *zone = get_zone( h, frame_num ); double q; if( h->param.rc.b_mb_tree ) { double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; q = pow( BASE_FRAME_DURATION / CLIP_DURATION(rce->i_duration * timescale), 1 - h->param.rc.f_qcompress ); } else q = pow( rce->blurred_complexity, 1 - rcc->qcompress ); // avoid NaN's in the rc_eq if( !isfinite(q) || rce->tex_bits + rce->mv_bits == 0 ) q = rcc->last_qscale_for[rce->pict_type]; else { rcc->last_rceq = q; q /= rate_factor; rcc->last_qscale = q; } if( zone ) { if( zone->b_force_qp ) q = qp2qscale( zone->i_qp ); else q /= zone->f_bitrate_factor; } return q; } static double get_diff_limited_q(x264_t *h, ratecontrol_entry_t *rce, double q, int frame_num) { x264_ratecontrol_t *rcc = h->rc; const int pict_type = rce->pict_type; x264_zone_t *zone = get_zone( h, frame_num ); // force I/B quants as a function of P quants if( pict_type == SLICE_TYPE_I ) { double iq = q; double pq = qp2qscale( rcc->accum_p_qp / rcc->accum_p_norm ); double ip_factor = h->param.rc.f_ip_factor; /* don't apply ip_factor if the following frame is also I */ if( rcc->accum_p_norm <= 0 ) q = iq; else if( rcc->accum_p_norm >= 1 ) q = pq / ip_factor; else q = rcc->accum_p_norm * pq / ip_factor + (1 - rcc->accum_p_norm) * iq; } else if( pict_type == SLICE_TYPE_B ) { q = rcc->last_qscale_for[rcc->last_non_b_pict_type]; if( !rce->kept_as_ref ) q *= h->param.rc.f_pb_factor; } else if( pict_type == SLICE_TYPE_P && rcc->last_non_b_pict_type == SLICE_TYPE_P && rce->tex_bits == 0 ) { q = rcc->last_qscale_for[SLICE_TYPE_P]; } /* last qscale / qdiff stuff */ if( rcc->last_non_b_pict_type == pict_type && (pict_type!=SLICE_TYPE_I || rcc->last_accum_p_norm < 1) ) { double last_q = rcc->last_qscale_for[pict_type]; double max_qscale = last_q * rcc->lstep; double min_qscale = last_q / rcc->lstep; if ( q > max_qscale ) q = max_qscale; else if( q < min_qscale ) q = min_qscale; } rcc->last_qscale_for[pict_type] = q; if( pict_type != SLICE_TYPE_B ) rcc->last_non_b_pict_type = pict_type; if( pict_type == SLICE_TYPE_I ) { rcc->last_accum_p_norm = rcc->accum_p_norm; rcc->accum_p_norm = 0; rcc->accum_p_qp = 0; } if( pict_type == SLICE_TYPE_P ) { float mask = 1 - pow( (float)rce->i_count / rcc->nmb, 2 ); rcc->accum_p_qp = mask * (qscale2qp( q ) + rcc->accum_p_qp); rcc->accum_p_norm = mask * (1 + rcc->accum_p_norm); } if( zone ) { if( zone->b_force_qp ) q = qp2qscale( zone->i_qp ); else q /= zone->f_bitrate_factor; } return q; } static float predict_size( predictor_t *p, float q, float var ) { return (p->coeff*var + p->offset) / (q*p->count); } 
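/* Illustrative sketch (hypothetical names, not part of the x264 API): predict_size() above is a
 * simple linear model, predicted_bits ~= (coeff*var + offset) / (qscale*count), where `var` is the
 * frame's SATD complexity; update_predictor() below refits coeff/offset from each frame's actual
 * (qscale, satd, bits) observation with exponential forgetting. The toy code below restates the
 * prediction on a plain struct, independent of x264's internal types. */
typedef struct
{
    float coeff;  /* accumulated bits*qscale/satd term */
    float offset; /* accumulated constant term */
    float count;  /* effective (decayed) number of observations */
} toy_predictor_t;

static inline float toy_predict_bits( const toy_predictor_t *p, float qscale, float satd )
{
    /* Higher complexity (satd) scales the prediction up; a coarser quantizer scales it down. */
    return (p->coeff * satd + p->offset) / (qscale * p->count);
}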
static void update_predictor( predictor_t *p, float q, float var, float bits ) { float range = 1.5; if( var < 10 ) return; float old_coeff = p->coeff / p->count; float old_offset = p->offset / p->count; float new_coeff = X264_MAX( (bits*q - old_offset) / var, p->coeff_min ); float new_coeff_clipped = x264_clip3f( new_coeff, old_coeff/range, old_coeff*range ); float new_offset = bits*q - new_coeff_clipped * var; if( new_offset >= 0 ) new_coeff = new_coeff_clipped; else new_offset = 0; p->count *= p->decay; p->coeff *= p->decay; p->offset *= p->decay; p->count ++; p->coeff += new_coeff; p->offset += new_offset; } // update VBV after encoding a frame static int update_vbv( x264_t *h, int bits ) { int filler = 0; int bitrate = h->sps->vui.hrd.i_bit_rate_unscaled; x264_ratecontrol_t *rcc = h->rc; x264_ratecontrol_t *rct = h->thread[0]->rc; int64_t buffer_size = (int64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; if( rcc->last_satd >= h->mb.i_mb_count ) update_predictor( &rct->pred[h->sh.i_type], qp2qscale( rcc->qpa_rc ), rcc->last_satd, bits ); if( !rcc->b_vbv ) return filler; uint64_t buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; rct->buffer_fill_final -= buffer_diff; rct->buffer_fill_final_min -= buffer_diff; if( rct->buffer_fill_final_min < 0 ) { double underflow = (double)rct->buffer_fill_final_min / h->sps->vui.i_time_scale; if( rcc->rate_factor_max_increment && rcc->qpm >= rcc->qp_novbv + rcc->rate_factor_max_increment ) x264_log( h, X264_LOG_DEBUG, "VBV underflow due to CRF-max (frame %d, %.0f bits)\n", h->i_frame, underflow ); else x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, underflow ); rct->buffer_fill_final = rct->buffer_fill_final_min = 0; } if( h->param.i_avcintra_class ) buffer_diff = buffer_size; else buffer_diff = (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration; rct->buffer_fill_final += buffer_diff; rct->buffer_fill_final_min += buffer_diff; if( rct->buffer_fill_final > buffer_size ) { if( h->param.rc.b_filler ) { int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8; filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale; bits = h->param.i_avcintra_class ? filler * 8 : X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8; buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; rct->buffer_fill_final -= buffer_diff; rct->buffer_fill_final_min -= buffer_diff; } else { rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size ); rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, buffer_size ); } } return filler; } void x264_hrd_fullness( x264_t *h ) { x264_ratecontrol_t *rct = h->thread[0]->rc; uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale / rct->hrd_multiply_denom; uint64_t cpb_state = rct->buffer_fill_final; uint64_t cpb_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; uint64_t multiply_factor = 90000 / rct->hrd_multiply_denom; if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > (int64_t)cpb_size ) { x264_log( h, X264_LOG_WARNING, "CPB %s: %.0f bits in a %.0f-bit buffer\n", rct->buffer_fill_final < 0 ? 
"underflow" : "overflow", (double)rct->buffer_fill_final / h->sps->vui.i_time_scale, (double)cpb_size / h->sps->vui.i_time_scale ); } h->initial_cpb_removal_delay = (multiply_factor * cpb_state) / denom; h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size) / denom - h->initial_cpb_removal_delay; int64_t decoder_buffer_fill = h->initial_cpb_removal_delay * denom / multiply_factor; rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, decoder_buffer_fill ); } // provisionally update VBV according to the planned size of all frames currently in progress static void update_vbv_plan( x264_t *h, int overhead ) { x264_ratecontrol_t *rcc = h->rc; rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final_min / h->sps->vui.i_time_scale; if( h->i_thread_frames > 1 ) { int j = rcc - h->thread[0]->rc; for( int i = 1; i < h->i_thread_frames; i++ ) { x264_t *t = h->thread[ (j+i)%h->i_thread_frames ]; double bits = t->rc->frame_size_planned; if( !t->b_thread_active ) continue; bits = X264_MAX(bits, t->rc->frame_size_estimated); rcc->buffer_fill -= bits; rcc->buffer_fill = X264_MAX( rcc->buffer_fill, 0 ); rcc->buffer_fill += t->rc->buffer_rate; rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size ); } } rcc->buffer_fill = X264_MIN( rcc->buffer_fill, rcc->buffer_size ); rcc->buffer_fill -= overhead; } // clip qscale to between lmin and lmax static double clip_qscale( x264_t *h, int pict_type, double q ) { x264_ratecontrol_t *rcc = h->rc; double lmin = rcc->lmin[pict_type]; double lmax = rcc->lmax[pict_type]; if( rcc->rate_factor_max_increment ) lmax = X264_MIN( lmax, qp2qscale( rcc->qp_novbv + rcc->rate_factor_max_increment ) ); if( lmin==lmax ) return lmin; else if( rcc->b_2pass ) { double min2 = log( lmin ); double max2 = log( lmax ); q = (log(q) - min2)/(max2-min2) - 0.5; q = 1.0/(1.0 + exp( -4*q )); q = q*(max2-min2) + min2; return exp( q ); } else return x264_clip3f( q, lmin, lmax ); } // apply VBV constraints static double vbv_pass1( x264_t *h, int pict_type, double q ) { x264_ratecontrol_t *rcc = h->rc; /* B-frames are not directly subject to VBV, * since they are controlled by the P-frames' QPs. */ if( rcc->b_vbv && rcc->last_satd > 0 ) { double q0 = q; double fenc_cpb_duration = (double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; /* Lookahead VBV: raise the quantizer as necessary such that no frames in * the lookahead overflow and such that the buffer is in a reasonable state * by the end of the lookahead. */ if( h->param.rc.i_lookahead ) { int terminate = 0; /* Avoid an infinite loop. */ for( int iterations = 0; iterations < 1000 && terminate != 3; iterations++ ) { double frame_q[3]; double cur_bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd ); double buffer_fill_cur = rcc->buffer_fill - cur_bits; double target_fill; double total_duration = 0; double last_duration = fenc_cpb_duration; frame_q[0] = h->sh.i_type == SLICE_TYPE_I ? q * h->param.rc.f_ip_factor : q; frame_q[1] = frame_q[0] * h->param.rc.f_pb_factor; frame_q[2] = frame_q[0] / h->param.rc.f_ip_factor; /* Loop over the planned future frames. */ for( int j = 0; buffer_fill_cur >= 0 && buffer_fill_cur <= rcc->buffer_size; j++ ) { total_duration += last_duration; buffer_fill_cur += rcc->vbv_max_rate * last_duration; int i_type = h->fenc->i_planned_type[j]; int i_satd = h->fenc->i_planned_satd[j]; if( i_type == X264_TYPE_AUTO ) break; i_type = IS_X264_TYPE_I( i_type ) ? SLICE_TYPE_I : IS_X264_TYPE_B( i_type ) ? 
SLICE_TYPE_B : SLICE_TYPE_P; cur_bits = predict_size( &rcc->pred[i_type], frame_q[i_type], i_satd ); buffer_fill_cur -= cur_bits; last_duration = h->fenc->f_planned_cpb_duration[j]; } /* Try to get the buffer at least 50% filled, but don't set an impossible goal. */ target_fill = X264_MIN( rcc->buffer_fill + total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.5 ); if( buffer_fill_cur < target_fill ) { q *= 1.01; terminate |= 1; continue; } /* Try to get the buffer no more than 80% filled, but don't set an impossible goal. */ target_fill = x264_clip3f( rcc->buffer_fill - total_duration * rcc->vbv_max_rate * 0.5, rcc->buffer_size * 0.8, rcc->buffer_size ); if( rcc->b_vbv_min_rate && buffer_fill_cur > target_fill ) { q /= 1.01; terminate |= 2; continue; } break; } } /* Fallback to old purely-reactive algorithm: no lookahead. */ else { if( ( pict_type == SLICE_TYPE_P || ( pict_type == SLICE_TYPE_I && rcc->last_non_b_pict_type == SLICE_TYPE_I ) ) && rcc->buffer_fill/rcc->buffer_size < 0.5 ) { q /= x264_clip3f( 2.0*rcc->buffer_fill/rcc->buffer_size, 0.5, 1.0 ); } /* Now a hard threshold to make sure the frame fits in VBV. * This one is mostly for I-frames. */ double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd ); /* For small VBVs, allow the frame to use up the entire VBV. */ double max_fill_factor = h->param.rc.i_vbv_buffer_size >= 5*h->param.rc.i_vbv_max_bitrate / rcc->fps ? 2 : 1; /* For single-frame VBVs, request that the frame use up the entire VBV. */ double min_fill_factor = rcc->single_frame_vbv ? 1 : 2; if( bits > rcc->buffer_fill/max_fill_factor ) { double qf = x264_clip3f( rcc->buffer_fill/(max_fill_factor*bits), 0.2, 1.0 ); q /= qf; bits *= qf; } if( bits < rcc->buffer_rate/min_fill_factor ) { double qf = x264_clip3f( bits*min_fill_factor/rcc->buffer_rate, 0.001, 1.0 ); q *= qf; } q = X264_MAX( q0, q ); } /* Check B-frame complexity, and use up any bits that would * overflow before the next P-frame.
*/ if( h->sh.i_type == SLICE_TYPE_P && !rcc->single_frame_vbv ) { int nb = rcc->bframes; double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd ); double pbbits = bits; double bbits = predict_size( rcc->pred_b_from_p, q * h->param.rc.f_pb_factor, rcc->last_satd ); double space; double bframe_cpb_duration = 0; double minigop_cpb_duration; for( int i = 0; i < nb; i++ ) bframe_cpb_duration += h->fenc->f_planned_cpb_duration[i]; if( bbits * nb > bframe_cpb_duration * rcc->vbv_max_rate ) { nb = 0; bframe_cpb_duration = 0; } pbbits += nb * bbits; minigop_cpb_duration = bframe_cpb_duration + fenc_cpb_duration; space = rcc->buffer_fill + minigop_cpb_duration*rcc->vbv_max_rate - rcc->buffer_size; if( pbbits < space ) { q *= X264_MAX( pbbits / space, bits / (0.5 * rcc->buffer_size) ); } q = X264_MAX( q0/2, q ); } if( !rcc->b_vbv_min_rate ) q = X264_MAX( q0, q ); } return clip_qscale( h, pict_type, q ); } // update qscale for 1 frame based on actual bits used so far static float rate_estimate_qscale( x264_t *h ) { float q; x264_ratecontrol_t *rcc = h->rc; ratecontrol_entry_t rce = {0}; int pict_type = h->sh.i_type; int64_t total_bits = 8*(h->stat.i_frame_size[SLICE_TYPE_I] + h->stat.i_frame_size[SLICE_TYPE_P] + h->stat.i_frame_size[SLICE_TYPE_B]) - rcc->filler_bits_sum; if( rcc->b_2pass ) { rce = *rcc->rce; if( pict_type != rce.pict_type ) { x264_log( h, X264_LOG_ERROR, "slice=%c but 2pass stats say %c\n", slice_type_to_char[pict_type], slice_type_to_char[rce.pict_type] ); } } if( pict_type == SLICE_TYPE_B ) { /* B-frames don't have independent ratecontrol, but rather get the * average QP of the two adjacent P-frames + an offset */ int i0 = IS_X264_TYPE_I(h->fref_nearest[0]->i_type); int i1 = IS_X264_TYPE_I(h->fref_nearest[1]->i_type); int dt0 = abs(h->fenc->i_poc - h->fref_nearest[0]->i_poc); int dt1 = abs(h->fenc->i_poc - h->fref_nearest[1]->i_poc); float q0 = h->fref_nearest[0]->f_qp_avg_rc; float q1 = h->fref_nearest[1]->f_qp_avg_rc; if( h->fref_nearest[0]->i_type == X264_TYPE_BREF ) q0 -= rcc->pb_offset/2; if( h->fref_nearest[1]->i_type == X264_TYPE_BREF ) q1 -= rcc->pb_offset/2; if( i0 && i1 ) q = (q0 + q1) / 2 + rcc->ip_offset; else if( i0 ) q = q1; else if( i1 ) q = q0; else q = (q0*dt1 + q1*dt0) / (dt0 + dt1); if( h->fenc->b_kept_as_ref ) q += rcc->pb_offset/2; else q += rcc->pb_offset; rcc->qp_novbv = q; q = qp2qscale( q ); if( rcc->b_2pass ) rcc->frame_size_planned = qscale2bits( &rce, q ); else rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref[1][h->i_ref[1]-1]->i_satd ); /* Apply MinCR and buffer fill restrictions */ if( rcc->b_vbv ) { double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) ); if( rcc->frame_size_planned > frame_size_maximum ) { q *= rcc->frame_size_planned / frame_size_maximum; rcc->frame_size_planned = frame_size_maximum; } } rcc->frame_size_estimated = rcc->frame_size_planned; /* For row SATDs */ if( rcc->b_vbv ) rcc->last_satd = x264_rc_analyse_slice( h ); return q; } else { double abr_buffer = 2 * rcc->rate_tolerance * rcc->bitrate; double predicted_bits = total_bits; if( h->i_thread_frames > 1 ) { int j = rcc - h->thread[0]->rc; for( int i = 1; i < h->i_thread_frames; i++ ) { x264_t *t = h->thread[(j+i) % h->i_thread_frames]; double bits = t->rc->frame_size_planned; if( !t->b_thread_active ) continue; bits = X264_MAX(bits, t->rc->frame_size_estimated); predicted_bits += bits; } } if( rcc->b_2pass ) { double lmin = rcc->lmin[pict_type]; double lmax = rcc->lmax[pict_type]; double diff; 
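/* Summary of the 2-pass correction below: `diff` measures how far the bits actually produced so
 * far have drifted from the first-pass prediction, and the qscale taken from the planned curve is
 * scaled by at most 2x in either direction, in proportion to how much of the ABR tolerance buffer
 * that drift has consumed. The buffer itself is shrunk towards the end of the video so that the
 * final stream size converges on the requested target. */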
/* Adjust ABR buffer based on distance to the end of the video. */ if( rcc->num_entries > h->i_frame ) { double final_bits = rcc->entry_out[rcc->num_entries-1]->expected_bits; double video_pos = rce.expected_bits / final_bits; double scale_factor = sqrt( (1 - video_pos) * rcc->num_entries ); abr_buffer *= 0.5 * X264_MAX( scale_factor, 0.5 ); } diff = predicted_bits - rce.expected_bits; q = rce.new_qscale; q /= x264_clip3f((abr_buffer - diff) / abr_buffer, .5, 2); if( h->i_frame >= rcc->fps && rcc->expected_bits_sum >= 1 ) { /* Adjust quant based on the difference between * achieved and expected bitrate so far */ double cur_time = (double)h->i_frame / rcc->num_entries; double w = x264_clip3f( cur_time*100, 0.0, 1.0 ); q *= pow( (double)total_bits / rcc->expected_bits_sum, w ); } rcc->qp_novbv = qscale2qp( q ); if( rcc->b_vbv ) { /* Do not overflow vbv */ double expected_size = qscale2bits( &rce, q ); double expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size; double expected_fullness = rce.expected_vbv / rcc->buffer_size; double qmax = q*(2 - expected_fullness); double size_constraint = 1 + expected_fullness; qmax = X264_MAX( qmax, rce.new_qscale ); if( expected_fullness < .05 ) qmax = lmax; qmax = X264_MIN(qmax, lmax); while( ((expected_vbv < rce.expected_vbv/size_constraint) && (q < qmax)) || ((expected_vbv < 0) && (q < lmax))) { q *= 1.05; expected_size = qscale2bits(&rce, q); expected_vbv = rcc->buffer_fill + rcc->buffer_rate - expected_size; } rcc->last_satd = x264_rc_analyse_slice( h ); } q = x264_clip3f( q, lmin, lmax ); } else /* 1pass ABR */ { /* Calculate the quantizer which would have produced the desired * average bitrate if it had been applied to all frames so far. * Then modulate that quant based on the current frame's complexity * relative to the average complexity so far (using the 2pass RCEQ). * Then bias the quant up or down if total size so far was far from * the target. * Result: Depending on the value of rate_tolerance, there is a * tradeoff between quality and bitrate precision. But at large * tolerances, the bit distribution approaches that of 2pass. */ double wanted_bits, overflow = 1; rcc->last_satd = x264_rc_analyse_slice( h ); rcc->short_term_cplxsum *= 0.5; rcc->short_term_cplxcount *= 0.5; rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION); rcc->short_term_cplxcount ++; rce.tex_bits = rcc->last_satd; rce.blurred_complexity = rcc->short_term_cplxsum / rcc->short_term_cplxcount; rce.mv_bits = 0; rce.p_count = rcc->nmb; rce.i_count = 0; rce.s_count = 0; rce.qscale = 1; rce.pict_type = pict_type; rce.i_duration = h->fenc->i_duration; if( h->param.rc.i_rc_method == X264_RC_CRF ) { q = get_qscale( h, &rce, rcc->rate_factor_constant, h->fenc->i_frame ); } else { q = get_qscale( h, &rce, rcc->wanted_bits_window / rcc->cplxr_sum, h->fenc->i_frame ); /* ABR code can potentially be counterproductive in CBR, so just don't bother. * Don't run it if the frame complexity is zero either. */ if( !rcc->b_vbv_min_rate && rcc->last_satd ) { // FIXME is it simpler to keep track of wanted_bits in ratecontrol_end? 
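/* Worked example of the correction below (illustrative numbers only): with a 1000 kbit/s target
 * and an ABR buffer of 2 Mbit before the sqrt(time) scaling, after 10 s of video we have
 * wanted_bits = 10 Mbit and abr_buffer ~= 2 Mbit * sqrt(10) ~= 6.3 Mbit. If 11 Mbit have been
 * produced (or are planned for frames in flight), then
 * overflow = x264_clip3f( 1 + 1/6.3, .5, 2 ) ~= 1.16, so the quantizer scale is raised by about
 * 16% to pull the stream back towards the target rate. */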
int i_frame_done = h->i_frame; double time_done = i_frame_done / rcc->fps; if( h->param.b_vfr_input && i_frame_done > 0 ) time_done = ((double)(h->fenc->i_reordered_pts - h->i_reordered_pts_delay)) * h->param.i_timebase_num / h->param.i_timebase_den; wanted_bits = time_done * rcc->bitrate; if( wanted_bits > 0 ) { abr_buffer *= X264_MAX( 1, sqrt( time_done ) ); overflow = x264_clip3f( 1.0 + (predicted_bits - wanted_bits) / abr_buffer, .5, 2 ); q *= overflow; } } } if( pict_type == SLICE_TYPE_I && h->param.i_keyint_max > 1 /* should test _next_ pict type, but that isn't decided yet */ && rcc->last_non_b_pict_type != SLICE_TYPE_I ) { q = qp2qscale( rcc->accum_p_qp / rcc->accum_p_norm ); q /= h->param.rc.f_ip_factor; } else if( h->i_frame > 0 ) { if( h->param.rc.i_rc_method != X264_RC_CRF ) { /* Asymmetric clipping, because symmetric would prevent * overflow control in areas of rapidly oscillating complexity */ double lmin = rcc->last_qscale_for[pict_type] / rcc->lstep; double lmax = rcc->last_qscale_for[pict_type] * rcc->lstep; if( overflow > 1.1 && h->i_frame > 3 ) lmax *= rcc->lstep; else if( overflow < 0.9 ) lmin /= rcc->lstep; q = x264_clip3f(q, lmin, lmax); } } else if( h->param.rc.i_rc_method == X264_RC_CRF && rcc->qcompress != 1 ) { q = qp2qscale( ABR_INIT_QP ) / h->param.rc.f_ip_factor; } rcc->qp_novbv = qscale2qp( q ); q = vbv_pass1( h, pict_type, q ); } rcc->last_qscale_for[pict_type] = rcc->last_qscale = q; if( !(rcc->b_2pass && !rcc->b_vbv) && h->fenc->i_frame == 0 ) rcc->last_qscale_for[SLICE_TYPE_P] = q * h->param.rc.f_ip_factor; if( rcc->b_2pass ) rcc->frame_size_planned = qscale2bits( &rce, q ); else rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd ); /* Apply MinCR and buffer fill restrictions */ if( rcc->b_vbv ) { double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) ); if( rcc->frame_size_planned > frame_size_maximum ) { q *= rcc->frame_size_planned / frame_size_maximum; rcc->frame_size_planned = frame_size_maximum; } /* Always use up the whole VBV in this case. */ if( rcc->single_frame_vbv ) rcc->frame_size_planned = X264_MIN( rcc->buffer_rate, frame_size_maximum ); } rcc->frame_size_estimated = rcc->frame_size_planned; return q; } } static void threads_normalize_predictors( x264_t *h ) { double totalsize = 0; for( int i = 0; i < h->param.i_threads; i++ ) totalsize += h->thread[i]->rc->slice_size_planned; double factor = h->rc->frame_size_planned / totalsize; for( int i = 0; i < h->param.i_threads; i++ ) h->thread[i]->rc->slice_size_planned *= factor; } void x264_threads_distribute_ratecontrol( x264_t *h ) { int row; x264_ratecontrol_t *rc = h->rc; x264_emms(); float qscale = qp2qscale( rc->qpm ); /* Initialize row predictors */ if( h->i_frame == 0 ) for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; if( t != h ) memcpy( t->rc->row_preds, rc->row_preds, sizeof(rc->row_preds) ); } for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; if( t != h ) memcpy( t->rc, rc, offsetof(x264_ratecontrol_t, row_pred) ); t->rc->row_pred = t->rc->row_preds[h->sh.i_type]; /* Calculate the planned slice size. 
*/ if( rc->b_vbv && rc->frame_size_planned ) { int size = 0; for( row = t->i_threadslice_start; row < t->i_threadslice_end; row++ ) size += h->fdec->i_row_satd[row]; t->rc->slice_size_planned = predict_size( &rc->pred[h->sh.i_type + (i+1)*5], qscale, size ); } else t->rc->slice_size_planned = 0; } if( rc->b_vbv && rc->frame_size_planned ) { threads_normalize_predictors( h ); if( rc->single_frame_vbv ) { /* Compensate for our max frame error threshold: give more bits (proportionally) to smaller slices. */ for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; float max_frame_error = x264_clip3f( 1.0 / (t->i_threadslice_end - t->i_threadslice_start), 0.05, 0.25 ); t->rc->slice_size_planned += 2 * max_frame_error * rc->frame_size_planned; } threads_normalize_predictors( h ); } for( int i = 0; i < h->param.i_threads; i++ ) h->thread[i]->rc->frame_size_estimated = h->thread[i]->rc->slice_size_planned; } } void x264_threads_merge_ratecontrol( x264_t *h ) { x264_ratecontrol_t *rc = h->rc; x264_emms(); for( int i = 0; i < h->param.i_threads; i++ ) { x264_t *t = h->thread[i]; x264_ratecontrol_t *rct = h->thread[i]->rc; if( h->param.rc.i_vbv_buffer_size ) { int size = 0; for( int row = t->i_threadslice_start; row < t->i_threadslice_end; row++ ) size += h->fdec->i_row_satd[row]; int bits = t->stat.frame.i_mv_bits + t->stat.frame.i_tex_bits + t->stat.frame.i_misc_bits; int mb_count = (t->i_threadslice_end - t->i_threadslice_start) * h->mb.i_mb_width; update_predictor( &rc->pred[h->sh.i_type+(i+1)*5], qp2qscale( rct->qpa_rc/mb_count ), size, bits ); } if( !i ) continue; rc->qpa_rc += rct->qpa_rc; rc->qpa_aq += rct->qpa_aq; } } void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next ) { if( cur != prev ) { #define COPY(var) memcpy(&cur->rc->var, &prev->rc->var, sizeof(cur->rc->var)) /* these vars are updated in x264_ratecontrol_start() * so copy them from the context that most recently started (prev) * to the context that's about to start (cur). */ COPY(accum_p_qp); COPY(accum_p_norm); COPY(last_satd); COPY(last_rceq); COPY(last_qscale_for); COPY(last_non_b_pict_type); COPY(short_term_cplxsum); COPY(short_term_cplxcount); COPY(bframes); COPY(prev_zone); COPY(mbtree.qpbuf_pos); /* these vars can be updated by x264_ratecontrol_init_reconfigurable */ COPY(bitrate); COPY(buffer_size); COPY(buffer_rate); COPY(vbv_max_rate); COPY(single_frame_vbv); COPY(cbr_decay); COPY(rate_factor_constant); COPY(rate_factor_max_increment); #undef COPY } if( cur != next ) { #define COPY(var) next->rc->var = cur->rc->var /* these vars are updated in x264_ratecontrol_end() * so copy them from the context that most recently ended (cur) * to the context that's about to end (next) */ COPY(cplxr_sum); COPY(expected_bits_sum); COPY(filler_bits_sum); COPY(wanted_bits_window); COPY(bframe_bits); COPY(initial_cpb_removal_delay); COPY(initial_cpb_removal_delay_offset); COPY(nrt_first_access_unit); COPY(previous_cpb_final_arrival_time); #undef COPY } //FIXME row_preds[] (not strictly necessary, but would improve prediction) /* the rest of the variables are either constant or thread-local */ } static int find_underflow( x264_t *h, double *fills, int *t0, int *t1, int over ) { /* find an interval ending on an overflow or underflow (depending on whether * we're adding or removing bits), and starting on the earliest frame that * can influence the buffer fill of that end frame. 
*/ x264_ratecontrol_t *rcc = h->rc; const double buffer_min = .1 * rcc->buffer_size; const double buffer_max = .9 * rcc->buffer_size; double fill = fills[*t0-1]; double parity = over ? 1. : -1.; int start = -1, end = -1; for( int i = *t0; i < rcc->num_entries; i++ ) { fill += (rcc->entry_out[i]->i_cpb_duration * rcc->vbv_max_rate * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale - qscale2bits( rcc->entry_out[i], rcc->entry_out[i]->new_qscale )) * parity; fill = x264_clip3f(fill, 0, rcc->buffer_size); fills[i] = fill; if( fill <= buffer_min || i == 0 ) { if( end >= 0 ) break; start = i; } else if( fill >= buffer_max && start >= 0 ) end = i; } *t0 = start; *t1 = end; return start >= 0 && end >= 0; } static int fix_underflow( x264_t *h, int t0, int t1, double adjustment, double qscale_min, double qscale_max ) { x264_ratecontrol_t *rcc = h->rc; double qscale_orig, qscale_new; int adjusted = 0; if( t0 > 0 ) t0++; for( int i = t0; i <= t1; i++ ) { qscale_orig = rcc->entry_out[i]->new_qscale; qscale_orig = x264_clip3f( qscale_orig, qscale_min, qscale_max ); qscale_new = qscale_orig * adjustment; qscale_new = x264_clip3f( qscale_new, qscale_min, qscale_max ); rcc->entry_out[i]->new_qscale = qscale_new; adjusted = adjusted || (qscale_new != qscale_orig); } return adjusted; } static double count_expected_bits( x264_t *h ) { x264_ratecontrol_t *rcc = h->rc; double expected_bits = 0; for( int i = 0; i < rcc->num_entries; i++ ) { ratecontrol_entry_t *rce = rcc->entry_out[i]; rce->expected_bits = expected_bits; expected_bits += qscale2bits( rce, rce->new_qscale ); } return expected_bits; } static int vbv_pass2( x264_t *h, double all_available_bits ) { /* for each interval of buffer_full .. underflow, uniformly increase the qp of all * frames in the interval until either buffer is full at some intermediate frame or the * last frame in the interval no longer underflows. Recompute intervals and repeat. * Then do the converse to put bits back into overflow areas until target size is met */ x264_ratecontrol_t *rcc = h->rc; double *fills; double expected_bits = 0; double adjustment; double prev_bits = 0; int t0, t1; double qscale_min = qp2qscale( h->param.rc.i_qp_min ); double qscale_max = qp2qscale( h->param.rc.i_qp_max ); int iterations = 0; int adj_min, adj_max; CHECKED_MALLOC( fills, (rcc->num_entries+1)*sizeof(double) ); fills++; /* adjust overall stream size */ do { iterations++; prev_bits = expected_bits; if( expected_bits ) { /* not first iteration */ adjustment = X264_MAX(X264_MIN(expected_bits / all_available_bits, 0.999), 0.9); fills[-1] = rcc->buffer_size * h->param.rc.f_vbv_buffer_init; t0 = 0; /* fix overflows */ adj_min = 1; while( adj_min && find_underflow( h, fills, &t0, &t1, 1 ) ) { adj_min = fix_underflow( h, t0, t1, adjustment, qscale_min, qscale_max ); t0 = t1; } } fills[-1] = rcc->buffer_size * (1. 
- h->param.rc.f_vbv_buffer_init); t0 = 0; /* fix underflows -- should be done after overflow, as we'd better undersize target than underflowing VBV */ adj_max = 1; while( adj_max && find_underflow( h, fills, &t0, &t1, 0 ) ) adj_max = fix_underflow( h, t0, t1, 1.001, qscale_min, qscale_max ); expected_bits = count_expected_bits( h ); } while( (expected_bits < .995*all_available_bits) && ((int64_t)(expected_bits+.5) > (int64_t)(prev_bits+.5)) ); if( !adj_max ) x264_log( h, X264_LOG_WARNING, "vbv-maxrate issue, qpmax or vbv-maxrate too low\n"); /* store expected vbv filling values for tracking when encoding */ for( int i = 0; i < rcc->num_entries; i++ ) rcc->entry_out[i]->expected_vbv = rcc->buffer_size - fills[i]; x264_free( fills-1 ); return 0; fail: return -1; } static int init_pass2( x264_t *h ) { x264_ratecontrol_t *rcc = h->rc; uint64_t all_const_bits = 0; double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; double duration = 0; for( int i = 0; i < rcc->num_entries; i++ ) duration += rcc->entry[i].i_duration; duration *= timescale; uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration; double rate_factor, step_mult; double qblur = h->param.rc.f_qblur; double cplxblur = h->param.rc.f_complexity_blur; const int filter_size = (int)(qblur*4) | 1; double expected_bits; double *qscale, *blurred_qscale; double base_cplx = h->mb.i_mb_count * (h->param.i_bframe ? 120 : 80); /* find total/average complexity & const_bits */ for( int i = 0; i < rcc->num_entries; i++ ) { ratecontrol_entry_t *rce = &rcc->entry[i]; all_const_bits += rce->misc_bits; } if( all_available_bits < all_const_bits) { x264_log( h, X264_LOG_ERROR, "requested bitrate is too low. estimated minimum is %d kbps\n", (int)(all_const_bits * rcc->fps / (rcc->num_entries * 1000.)) ); return -1; } /* Blur complexities, to reduce local fluctuation of QP. * We don't blur the QPs directly, because then one very simple frame * could drag down the QP of a nearby complex frame and give it more * bits than intended. 
*/ for( int i = 0; i < rcc->num_entries; i++ ) { ratecontrol_entry_t *rce = &rcc->entry[i]; double weight_sum = 0; double cplx_sum = 0; double weight = 1.0; double gaussian_weight; /* weighted average of cplx of future frames */ for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ ) { ratecontrol_entry_t *rcj = &rcc->entry[i+j]; double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION; weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 ); if( weight < .0001 ) break; gaussian_weight = weight * exp( -j*j/200.0 ); weight_sum += gaussian_weight; cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration; } /* weighted average of cplx of past frames */ weight = 1.0; for( int j = 0; j <= cplxblur*2 && j <= i; j++ ) { ratecontrol_entry_t *rcj = &rcc->entry[i-j]; double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION; gaussian_weight = weight * exp( -j*j/200.0 ); weight_sum += gaussian_weight; cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration; weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 ); if( weight < .0001 ) break; } rce->blurred_complexity = cplx_sum / weight_sum; } CHECKED_MALLOC( qscale, sizeof(double)*rcc->num_entries ); if( filter_size > 1 ) CHECKED_MALLOC( blurred_qscale, sizeof(double)*rcc->num_entries ); else blurred_qscale = qscale; /* Search for a factor which, when multiplied by the RCEQ values from * each frame, adds up to the desired total size. * There is no exact closed-form solution because of VBV constraints and * because qscale2bits is not invertible, but we can start with the simple * approximation of scaling the 1st pass by the ratio of bitrates. * The search range is probably overkill, but speed doesn't matter here. */ expected_bits = 1; for( int i = 0; i < rcc->num_entries; i++ ) { double q = get_qscale(h, &rcc->entry[i], 1.0, i); expected_bits += qscale2bits(&rcc->entry[i], q); rcc->last_qscale_for[rcc->entry[i].pict_type] = q; } step_mult = all_available_bits / expected_bits; rate_factor = 0; for( double step = 1E4 * step_mult; step > 1E-7 * step_mult; step *= 0.5) { expected_bits = 0; rate_factor += step; rcc->last_non_b_pict_type = -1; rcc->last_accum_p_norm = 1; rcc->accum_p_norm = 0; rcc->last_qscale_for[0] = rcc->last_qscale_for[1] = rcc->last_qscale_for[2] = pow( base_cplx, 1 - rcc->qcompress ) / rate_factor; /* find qscale */ for( int i = 0; i < rcc->num_entries; i++ ) { qscale[i] = get_qscale( h, &rcc->entry[i], rate_factor, -1 ); rcc->last_qscale_for[rcc->entry[i].pict_type] = qscale[i]; } /* fixed I/B qscale relative to P */ for( int i = rcc->num_entries-1; i >= 0; i-- ) { qscale[i] = get_diff_limited_q( h, &rcc->entry[i], qscale[i], i ); assert(qscale[i] >= 0); } /* smooth curve */ if( filter_size > 1 ) { assert( filter_size%2 == 1 ); for( int i = 0; i < rcc->num_entries; i++ ) { ratecontrol_entry_t *rce = &rcc->entry[i]; double q = 0.0, sum = 0.0; for( int j = 0; j < filter_size; j++ ) { int idx = i+j-filter_size/2; double d = idx-i; double coeff = qblur==0 ? 
1.0 : exp( -d*d/(qblur*qblur) ); if( idx < 0 || idx >= rcc->num_entries ) continue; if( rce->pict_type != rcc->entry[idx].pict_type ) continue; q += qscale[idx] * coeff; sum += coeff; } blurred_qscale[i] = q/sum; } } /* find expected bits */ for( int i = 0; i < rcc->num_entries; i++ ) { ratecontrol_entry_t *rce = &rcc->entry[i]; rce->new_qscale = clip_qscale( h, rce->pict_type, blurred_qscale[i] ); assert(rce->new_qscale >= 0); expected_bits += qscale2bits( rce, rce->new_qscale ); } if( expected_bits > all_available_bits ) rate_factor -= step; } x264_free( qscale ); if( filter_size > 1 ) x264_free( blurred_qscale ); if( rcc->b_vbv ) if( vbv_pass2( h, all_available_bits ) ) return -1; expected_bits = count_expected_bits( h ); if( fabs( expected_bits/all_available_bits - 1.0 ) > 0.01 ) { double avgq = 0; for( int i = 0; i < rcc->num_entries; i++ ) avgq += rcc->entry[i].new_qscale; avgq = qscale2qp( avgq / rcc->num_entries ); if( expected_bits > all_available_bits || !rcc->b_vbv ) x264_log( h, X264_LOG_WARNING, "Error: 2pass curve failed to converge\n" ); x264_log( h, X264_LOG_WARNING, "target: %.2f kbit/s, expected: %.2f kbit/s, avg QP: %.4f\n", (float)h->param.rc.i_bitrate, expected_bits * rcc->fps / (rcc->num_entries * 1000.), avgq ); if( expected_bits < all_available_bits && avgq < h->param.rc.i_qp_min + 2 ) { if( h->param.rc.i_qp_min > 0 ) x264_log( h, X264_LOG_WARNING, "try reducing target bitrate or reducing qp_min (currently %d)\n", h->param.rc.i_qp_min ); else x264_log( h, X264_LOG_WARNING, "try reducing target bitrate\n" ); } else if( expected_bits > all_available_bits && avgq > h->param.rc.i_qp_max - 2 ) { if( h->param.rc.i_qp_max < QP_MAX ) x264_log( h, X264_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", h->param.rc.i_qp_max ); else x264_log( h, X264_LOG_WARNING, "try increasing target bitrate\n"); } else if( !(rcc->b_2pass && rcc->b_vbv) ) x264_log( h, X264_LOG_WARNING, "internal error\n" ); } return 0; fail: return -1; } x264-master/encoder/ratecontrol.h000066400000000000000000000107761502133446700172160ustar00rootroot00000000000000/***************************************************************************** * ratecontrol.h: ratecontrol ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ENCODER_RATECONTROL_H #define X264_ENCODER_RATECONTROL_H /* Completely arbitrary. Ratecontrol lowers relative quality at higher framerates * and the reverse at lower framerates; this serves as the center of the curve. 
* Halve all the values for frame-packed 3D to compensate for the "doubled" * framerate. */ #define BASE_FRAME_DURATION (0.04f / ((h->param.i_frame_packing == 5)+1)) /* Arbitrary limitations as a sanity check. */ #define MAX_FRAME_DURATION (1.00f / ((h->param.i_frame_packing == 5)+1)) #define MIN_FRAME_DURATION (0.01f / ((h->param.i_frame_packing == 5)+1)) #define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION) #define x264_ratecontrol_new x264_template(ratecontrol_new) int x264_ratecontrol_new ( x264_t * ); #define x264_ratecontrol_delete x264_template(ratecontrol_delete) void x264_ratecontrol_delete( x264_t * ); #define x264_ratecontrol_init_reconfigurable x264_template(ratecontrol_init_reconfigurable) void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init ); #define x264_encoder_reconfig_apply x264_template(encoder_reconfig_apply) int x264_encoder_reconfig_apply( x264_t *h, x264_param_t *param ); #define x264_adaptive_quant_frame x264_template(adaptive_quant_frame) void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets ); #define x264_macroblock_tree_read x264_template(macroblock_tree_read) int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offsets ); #define x264_reference_build_list_optimal x264_template(reference_build_list_optimal) int x264_reference_build_list_optimal( x264_t *h ); #define x264_thread_sync_ratecontrol x264_template(thread_sync_ratecontrol) void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next ); #define x264_ratecontrol_zone_init x264_template(ratecontrol_zone_init) void x264_ratecontrol_zone_init( x264_t * ); #define x264_ratecontrol_start x264_template(ratecontrol_start) void x264_ratecontrol_start( x264_t *, int i_force_qp, int overhead ); #define x264_ratecontrol_slice_type x264_template(ratecontrol_slice_type) int x264_ratecontrol_slice_type( x264_t *, int i_frame ); #define x264_ratecontrol_set_weights x264_template(ratecontrol_set_weights) void x264_ratecontrol_set_weights( x264_t *h, x264_frame_t *frm ); #define x264_ratecontrol_mb x264_template(ratecontrol_mb) int x264_ratecontrol_mb( x264_t *, int bits ); #define x264_ratecontrol_qp x264_template(ratecontrol_qp) int x264_ratecontrol_qp( x264_t * ); #define x264_ratecontrol_mb_qp x264_template(ratecontrol_mb_qp) int x264_ratecontrol_mb_qp( x264_t *h ); #define x264_ratecontrol_end x264_template(ratecontrol_end) int x264_ratecontrol_end( x264_t *, int bits, int *filler ); #define x264_ratecontrol_summary x264_template(ratecontrol_summary) void x264_ratecontrol_summary( x264_t * ); #define x264_rc_analyse_slice x264_template(rc_analyse_slice) int x264_rc_analyse_slice( x264_t *h ); #define x264_threads_distribute_ratecontrol x264_template(threads_distribute_ratecontrol) void x264_threads_distribute_ratecontrol( x264_t *h ); #define x264_threads_merge_ratecontrol x264_template(threads_merge_ratecontrol) void x264_threads_merge_ratecontrol( x264_t *h ); #define x264_hrd_fullness x264_template(hrd_fullness) void x264_hrd_fullness( x264_t *h ); #endif x264-master/encoder/rdo.c000066400000000000000000001377111502133446700154400ustar00rootroot00000000000000/***************************************************************************** * rdo.c: rate-distortion optimization ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Loren Merritt * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under 
the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ /* duplicate all the writer functions, just calculating bit cost * instead of writing the bitstream. * TODO: use these for fast 1st pass too. */ #define RDO_SKIP_BS 1 /* Transition and size tables for abs<9 MVD and residual coding */ /* Consist of i_prefix-2 1s, one zero, and a bypass sign bit */ #define x264_cabac_transition_unary x264_template(cabac_transition_unary) uint8_t x264_cabac_transition_unary[15][128]; #define x264_cabac_size_unary x264_template(cabac_size_unary) uint16_t x264_cabac_size_unary[15][128]; /* Transition and size tables for abs>9 MVD */ /* Consist of 5 1s and a bypass sign bit */ static uint8_t cabac_transition_5ones[128]; static uint16_t cabac_size_5ones[128]; /* CAVLC: produces exactly the same bit count as a normal encode */ /* this probably still leaves some unnecessary computations */ #define bs_write1(s,v) ((s)->i_bits_encoded += 1) #define bs_write(s,n,v) ((s)->i_bits_encoded += (n)) #define bs_write_ue(s,v) ((s)->i_bits_encoded += bs_size_ue(v)) #define bs_write_se(s,v) ((s)->i_bits_encoded += bs_size_se(v)) #define bs_write_te(s,v,l) ((s)->i_bits_encoded += bs_size_te(v,l)) #undef x264_macroblock_write_cavlc #define x264_macroblock_write_cavlc static macroblock_size_cavlc #include "cavlc.c" /* CABAC: not exactly the same. x264_cabac_size_decision() keeps track of * fractional bits, but only finite precision. */ #undef x264_cabac_encode_decision #undef x264_cabac_encode_decision_noup #undef x264_cabac_encode_bypass #undef x264_cabac_encode_terminal #undef x264_cabac_encode_ue_bypass #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v) #define x264_cabac_encode_decision_noup(c,x,v) x264_cabac_size_decision_noup(c,x,v) #define x264_cabac_encode_terminal(c) ((c)->f8_bits_encoded += 7) #define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256) #define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \ sizeof(int) + (CHROMA444 ? 
1024+12 : 460) ) #define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size ) static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y ) { static const uint8_t hadamard_shift_x[4] = {4, 4, 3, 3}; static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1}; static const uint8_t hadamard_offset[4] = {0, 1, 3, 5}; int cache_index = (x >> hadamard_shift_x[size]) + (y >> hadamard_shift_y[size]) + hadamard_offset[size]; uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index]; if( res ) return res - 1; else { pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE; res = h->pixf.hadamard_ac[size]( fenc, FENC_STRIDE ); h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1; return res; } } static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y ) { static const uint8_t satd_shift_x[3] = {3, 2, 2}; static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2}; static const uint8_t satd_offset[3] = {0, 8, 16}; int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4]) + satd_offset[size - PIXEL_8x4]; int res = h->mb.pic.fenc_satd_cache[cache_index]; if( res ) return res - 1; else { pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE; int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) >> 1; res = h->pixf.satd[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) - dc; h->mb.pic.fenc_satd_cache[cache_index] = res + 1; return res; } } /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */ /* SATD and SA8D are used to measure block complexity. */ /* The difference between SATD and SA8D scores are both used to avoid bias from the DCT size. Using SATD */ /* only, for example, results in overusage of 8x8dct, while the opposite occurs when using SA8D. */ /* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */ /* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */ /* This optimization can also be used in non-RD transform decision. */ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y ) { int satd = 0; pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE; pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE; if( p == 0 && h->mb.i_psy_rd ) { /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. 
*/ if( size <= PIXEL_8x8 ) { uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE ); uint64_t fenc_acs = cached_hadamard( h, size, x, y ); satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs) + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32)); satd >>= 1; } else { int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) >> 1; satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) - dc - cached_satd( h, size, x, y )); } int64_t tmp = ((int64_t)satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8; satd = X264_MIN( tmp, COST_MAX ); } return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd; } static inline int ssd_mb( x264_t *h ) { int i_ssd = ssd_plane( h, PIXEL_16x16, 0, 0, 0 ); if( CHROMA_FORMAT ) { int chroma_size = h->luma2chroma_pixel[PIXEL_16x16]; int chroma_ssd = ssd_plane( h, chroma_size, 1, 0, 0 ) + ssd_plane( h, chroma_size, 2, 0, 0 ); i_ssd += ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8; } return i_ssd; } static int rd_cost_mb( x264_t *h, int i_lambda2 ) { int b_transform_bak = h->mb.b_transform_8x8; int i_ssd; int i_bits; int type_bak = h->mb.i_type; x264_macroblock_encode( h ); if( h->mb.b_deblock_rdo ) x264_macroblock_deblock( h ); i_ssd = ssd_mb( h ); if( IS_SKIP( h->mb.i_type ) ) { i_bits = (1 * i_lambda2 + 128) >> 8; } else if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; COPY_CABAC; macroblock_size_cabac( h, &cabac_tmp ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; } else { macroblock_size_cavlc( h ); i_bits = ( (uint64_t)h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; } h->mb.b_transform_8x8 = b_transform_bak; h->mb.i_type = type_bak; return X264_MIN( i_ssd + i_bits, COST_MAX ); } /* partition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */ static uint64_t rd_cost_subpart( x264_t *h, int i_lambda2, int i4, int i_pixel ) { uint64_t i_ssd, i_bits; x264_macroblock_encode_p4x4( h, i4 ); if( i_pixel == PIXEL_8x4 ) x264_macroblock_encode_p4x4( h, i4+1 ); if( i_pixel == PIXEL_4x8 ) x264_macroblock_encode_p4x4( h, i4+2 ); i_ssd = ssd_plane( h, i_pixel, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 ); if( CHROMA444 ) { int chromassd = ssd_plane( h, i_pixel, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 ) + ssd_plane( h, i_pixel, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 ); chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; i_ssd += chromassd; } if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; COPY_CABAC; subpartition_size_cabac( h, &cabac_tmp, i4, i_pixel ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else i_bits = subpartition_size_cavlc( h, i4, i_pixel ); return (i_ssd<<8) + i_bits; } uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) { uint64_t i_ssd, i_bits; int i8 = i4 >> 2; if( i_pixel == PIXEL_16x16 ) { int i_cost = rd_cost_mb( h, i_lambda2 ); return i_cost; } if( i_pixel > PIXEL_8x8 ) return rd_cost_subpart( h, i_lambda2, i4, i_pixel ); h->mb.i_cbp_luma = 0; x264_macroblock_encode_p8x8( h, i8 ); if( i_pixel == PIXEL_16x8 ) x264_macroblock_encode_p8x8( h, i8+1 ); if( i_pixel == PIXEL_8x16 ) x264_macroblock_encode_p8x8( h, i8+2 ); int ssd_x = 8*(i8&1); int ssd_y = 8*(i8>>1); i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y ); if( CHROMA_FORMAT ) { int chroma_size = h->luma2chroma_pixel[i_pixel]; int chroma_ssd = ssd_plane( h, chroma_size, 1, ssd_x>>CHROMA_H_SHIFT, ssd_y>>CHROMA_V_SHIFT ) + ssd_plane( h, chroma_size, 2, ssd_x>>CHROMA_H_SHIFT, 
ssd_y>>CHROMA_V_SHIFT ); i_ssd += ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8; } if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; COPY_CABAC; partition_size_cabac( h, &cabac_tmp, i8, i_pixel ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else i_bits = (uint64_t)partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; return (i_ssd<<8) + i_bits; } static uint64_t rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[4][32] ) { uint64_t i_ssd, i_bits; int plane_count = CHROMA444 ? 3 : 1; int i_qp = h->mb.i_qp; h->mb.i_cbp_luma &= ~(1<mb.b_transform_8x8 = 1; for( int p = 0; p < plane_count; p++ ) { x264_mb_encode_i8x8( h, p, i8, i_qp, i_mode, edge[p], 1 ); i_qp = h->mb.i_chroma_qp; } i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 ); if( CHROMA444 ) { int chromassd = ssd_plane( h, PIXEL_8x8, 1, (i8&1)*8, (i8>>1)*8 ) + ssd_plane( h, PIXEL_8x8, 2, (i8&1)*8, (i8>>1)*8 ); chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; i_ssd += chromassd; } if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; COPY_CABAC; partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else i_bits = (uint64_t)partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } static uint64_t rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode ) { uint64_t i_ssd, i_bits; int plane_count = CHROMA444 ? 3 : 1; int i_qp = h->mb.i_qp; for( int p = 0; p < plane_count; p++ ) { x264_mb_encode_i4x4( h, p, i4, i_qp, i_mode, 1 ); i_qp = h->mb.i_chroma_qp; } i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 ); if( CHROMA444 ) { int chromassd = ssd_plane( h, PIXEL_4x4, 1, block_idx_x[i4]*4, block_idx_y[i4]*4 ) + ssd_plane( h, PIXEL_4x4, 2, block_idx_x[i4]*4, block_idx_y[i4]*4 ); chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; i_ssd += chromassd; } if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; COPY_CABAC; partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else i_bits = (uint64_t)partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } static uint64_t rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) { uint64_t i_ssd, i_bits; if( b_dct ) x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; i_ssd = ssd_plane( h, chromapix, 1, 0, 0 ) + ssd_plane( h, chromapix, 2, 0, 0 ); h->mb.i_chroma_pred_mode = i_mode; if( h->param.b_cabac ) { x264_cabac_t cabac_tmp; COPY_CABAC; chroma_size_cabac( h, &cabac_tmp ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else i_bits = (uint64_t)chroma_size_cavlc( h ) * i_lambda2; return (i_ssd<<8) + i_bits; } /**************************************************************************** * Trellis RD quantization ****************************************************************************/ #define TRELLIS_SCORE_MAX (~0ULL) // marks the node as invalid #define TRELLIS_SCORE_BIAS (1ULL<<60) // bias so that all valid scores are positive, even after negative contributions from psy #define CABAC_SIZE_BITS 8 #define LAMBDA_BITS 4 /* precalculate the cost of coding various combinations of bits in a single context */ void x264_rdo_init( void ) { for( int i_prefix = 0; i_prefix < 15; i_prefix++ ) { for( int i_ctx = 0; i_ctx < 128; i_ctx++ ) { int f8_bits = 0; uint8_t ctx = 
i_ctx; for( int i = 1; i < i_prefix; i++ ) f8_bits += x264_cabac_size_decision2( &ctx, 1 ); if( i_prefix > 0 && i_prefix < 14 ) f8_bits += x264_cabac_size_decision2( &ctx, 0 ); f8_bits += 1 << CABAC_SIZE_BITS; //sign x264_cabac_size_unary[i_prefix][i_ctx] = f8_bits; x264_cabac_transition_unary[i_prefix][i_ctx] = ctx; } } for( int i_ctx = 0; i_ctx < 128; i_ctx++ ) { int f8_bits = 0; uint8_t ctx = i_ctx; for( int i = 0; i < 5; i++ ) f8_bits += x264_cabac_size_decision2( &ctx, 1 ); f8_bits += 1 << CABAC_SIZE_BITS; //sign cabac_size_5ones[i_ctx] = f8_bits; cabac_transition_5ones[i_ctx] = ctx; } } typedef struct { uint64_t score; int level_idx; // index into level_tree[] uint8_t cabac_state[4]; // just contexts 0,4,8,9 of the 10 relevant to coding abs_level_m1 } trellis_node_t; typedef struct { uint16_t next; uint16_t abs_level; } trellis_level_t; // TODO: // save cabac state between blocks? // use trellis' RD score instead of x264_mb_decimate_score? // code 8x8 sig/last flags forwards with deadzone and save the contexts at // each position? // change weights when using CQMs? // possible optimizations: // make scores fit in 32bit // save quantized coefs during rd, to avoid a duplicate trellis in the final encode // if trellissing all MBRD modes, finish SSD calculation so we can skip all of // the normal dequant/idct/ssd/cabac // the unquant_mf here is not the same as dequant_mf: // in normal operation (dct->quant->dequant->idct) the dct and idct are not // normalized. quant/dequant absorb those scaling factors. // in this function, we just do (quant->unquant) and want the output to be // comparable to the input. so unquant is the direct inverse of quant, // and uses the dct scaling factors, not the idct ones. #define SIGN(x,y) ((x^(y >> 31))-(y >> 31)) #define SET_LEVEL(ndst, nsrc, l) {\ if( sizeof(trellis_level_t) == sizeof(uint32_t) )\ M32( &level_tree[levels_used] ) = pack16to32( nsrc.level_idx, l );\ else\ level_tree[levels_used] = (trellis_level_t){ nsrc.level_idx, l };\ ndst.level_idx = levels_used;\ levels_used++;\ } // encode all values of the dc coef in a block which is known to have no ac static NOINLINE int trellis_dc_shortcut( int sign_coef, int quant_coef, int unquant_mf, int coef_weight, int lambda2, uint8_t *cabac_state, int cost_sig ) { uint64_t bscore = TRELLIS_SCORE_MAX; int ret = 0; int q = abs( quant_coef ); for( int abs_level = q-1; abs_level <= q; abs_level++ ) { int unquant_abs_level = (unquant_mf * abs_level + 128) >> 8; /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. 
*/ int d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15); uint64_t score = (int64_t)d*d * coef_weight; /* code the proposed level, and count how much entropy it would take */ if( abs_level ) { unsigned f8_bits = cost_sig; int prefix = X264_MIN( abs_level - 1, 14 ); f8_bits += x264_cabac_size_decision_noup2( cabac_state+1, prefix > 0 ); f8_bits += x264_cabac_size_unary[prefix][cabac_state[5]]; if( abs_level >= 15 ) f8_bits += bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS; score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); } COPY2_IF_LT( bscore, score, ret, abs_level ); } return SIGN(ret, sign_coef); } // encode one value of one coef in one context static ALWAYS_INLINE int trellis_coef( int j, int const_level, int abs_level, int prefix, int suffix_cost, int node_ctx, int level1_ctx, int levelgt1_ctx, uint64_t ssd, int cost_siglast[3], trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state ) { uint64_t score = nodes_prev[j].score + ssd; /* code the proposed level, and count how much entropy it would take */ unsigned f8_bits = cost_siglast[ j ? 1 : 2 ]; uint8_t level1_state = (j >= 3) ? nodes_prev[j].cabac_state[level1_ctx>>2] : level_state[level1_ctx]; f8_bits += x264_cabac_entropy[level1_state ^ (const_level > 1)]; uint8_t levelgt1_state; if( const_level > 1 ) { levelgt1_state = j >= 6 ? nodes_prev[j].cabac_state[levelgt1_ctx-6] : level_state[levelgt1_ctx]; f8_bits += x264_cabac_size_unary[prefix][levelgt1_state] + suffix_cost; } else f8_bits += 1 << CABAC_SIZE_BITS; score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); /* save the node if it's better than any existing node with the same cabac ctx */ if( score < nodes_cur[node_ctx].score ) { nodes_cur[node_ctx].score = score; if( j == 2 || (j <= 3 && node_ctx == 4) ) // init from input state M32(nodes_cur[node_ctx].cabac_state) = M32(level_state+12); else if( j >= 3 ) M32(nodes_cur[node_ctx].cabac_state) = M32(nodes_prev[j].cabac_state); if( j >= 3 ) // skip the transition if we're not going to reuse the context nodes_cur[node_ctx].cabac_state[level1_ctx>>2] = x264_cabac_transition[level1_state][const_level > 1]; if( const_level > 1 && node_ctx == 7 ) nodes_cur[node_ctx].cabac_state[levelgt1_ctx-6] = x264_cabac_transition_unary[prefix][levelgt1_state]; nodes_cur[node_ctx].level_idx = nodes_prev[j].level_idx; SET_LEVEL( nodes_cur[node_ctx], nodes_prev[j], abs_level ); } return levels_used; } // encode one value of one coef in all contexts, templated by which value that is. // in ctx_lo, the set of live nodes is contiguous and starts at ctx0, so return as soon as we've seen one failure. // in ctx_hi, they're contiguous within each block of 4 ctxs, but not necessarily starting at the beginning, // so exploiting that would be more complicated. 
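/* Illustrative sketch (hypothetical helper, not part of the x264 API): every candidate level
 * considered by trellis_dc_shortcut() and trellis_coef() above is scored with the same
 * per-coefficient D + lambda*R form: squared reconstruction error weighted by the coefficient's
 * position, plus the estimated CABAC cost, where f8_bits is kept in 1/(1<<CABAC_SIZE_BITS)-bit
 * units. The toy function below restates that arithmetic in isolation. */
static ALWAYS_INLINE uint64_t toy_trellis_score( int d, int coef_weight, unsigned f8_bits, int lambda2 )
{
    /* distortion term: weighted squared error of this coefficient */
    uint64_t score = (uint64_t)((int64_t)d * d) * coef_weight;
    /* rate term: fractional CABAC bits scaled by lambda2, shifted back to score precision */
    score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
    return score;
}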
static NOINLINE int trellis_coef0_0( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used ) { nodes_cur[0].score = nodes_prev[0].score + ssd0; nodes_cur[0].level_idx = nodes_prev[0].level_idx; for( int j = 1; j < 4 && (int64_t)nodes_prev[j].score >= 0; j++ ) { nodes_cur[j].score = nodes_prev[j].score; if( j >= 3 ) M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state); SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 ); } return levels_used; } static NOINLINE int trellis_coef0_1( uint64_t ssd0, trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used ) { for( int j = 1; j < 8; j++ ) // this branch only affects speed, not function; there's nothing wrong with updating invalid nodes in coef0. if( (int64_t)nodes_prev[j].score >= 0 ) { nodes_cur[j].score = nodes_prev[j].score; if( j >= 3 ) M32(nodes_cur[j].cabac_state) = M32(nodes_prev[j].cabac_state); SET_LEVEL( nodes_cur[j], nodes_prev[j], 0 ); } return levels_used; } #define COEF(const_level, ctx_hi, j, ...)\ if( !j || (int64_t)nodes_prev[j].score >= 0 )\ levels_used = trellis_coef( j, const_level, abs_level, prefix, suffix_cost, __VA_ARGS__,\ j?ssd1:ssd0, cost_siglast, nodes_cur, nodes_prev,\ level_tree, levels_used, lambda2, level_state );\ else if( !ctx_hi )\ return levels_used; static NOINLINE int trellis_coef1_0( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state ) { int abs_level = 1, prefix = 1, suffix_cost = 0; COEF( 1, 0, 0, 1, 1, 0 ); COEF( 1, 0, 1, 2, 2, 0 ); COEF( 1, 0, 2, 3, 3, 0 ); COEF( 1, 0, 3, 3, 4, 0 ); return levels_used; } static NOINLINE int trellis_coef1_1( uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state ) { int abs_level = 1, prefix = 1, suffix_cost = 0; COEF( 1, 1, 1, 2, 2, 0 ); COEF( 1, 1, 2, 3, 3, 0 ); COEF( 1, 1, 3, 3, 4, 0 ); COEF( 1, 1, 4, 4, 0, 0 ); COEF( 1, 1, 5, 5, 0, 0 ); COEF( 1, 1, 6, 6, 0, 0 ); COEF( 1, 1, 7, 7, 0, 0 ); return levels_used; } static NOINLINE int trellis_coefn_0( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state, int levelgt1_ctx ) { int prefix = X264_MIN( abs_level-1, 14 ); int suffix_cost = abs_level >= 15 ? bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0; COEF( 2, 0, 0, 4, 1, 5 ); COEF( 2, 0, 1, 4, 2, 5 ); COEF( 2, 0, 2, 4, 3, 5 ); COEF( 2, 0, 3, 4, 4, 5 ); return levels_used; } static NOINLINE int trellis_coefn_1( int abs_level, uint64_t ssd0, uint64_t ssd1, int cost_siglast[3], trellis_node_t *nodes_cur, trellis_node_t *nodes_prev, trellis_level_t *level_tree, int levels_used, int lambda2, uint8_t *level_state, int levelgt1_ctx ) { int prefix = X264_MIN( abs_level-1, 14 ); int suffix_cost = abs_level >= 15 ? 
bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS : 0; COEF( 2, 1, 1, 4, 2, 5 ); COEF( 2, 1, 2, 4, 3, 5 ); COEF( 2, 1, 3, 4, 4, 5 ); COEF( 2, 1, 4, 5, 0, 6 ); COEF( 2, 1, 5, 6, 0, 7 ); COEF( 2, 1, 6, 7, 0, 8 ); COEF( 2, 1, 7, 7, 0, levelgt1_ctx ); return levels_used; } static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct, udctcoef *quant_mf, udctcoef *quant_bias, const int *unquant_mf, const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] ); ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; uint8_t *cabac_state_sig = &h->cabac.state[ x264_significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; uint8_t *cabac_state_last = &h->cabac.state[ x264_last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; int levelgt1_ctx = b_chroma && dc ? 8 : 9; if( dc ) { if( num_coefs == 16 ) { memcpy( orig_coefs, dct, sizeof(dctcoef)*16 ); if( !h->quantf.quant_4x4_dc( dct, quant_mf[0] >> 1, quant_bias[0] << 1 ) ) return 0; h->zigzagf.scan_4x4( quant_coefs, dct ); } else { memcpy( orig_coefs, dct, sizeof(dctcoef)*num_coefs ); int nz = h->quantf.quant_2x2_dc( &dct[0], quant_mf[0] >> 1, quant_bias[0] << 1 ); if( num_coefs == 8 ) nz |= h->quantf.quant_2x2_dc( &dct[4], quant_mf[0] >> 1, quant_bias[0] << 1 ); if( !nz ) return 0; for( int i = 0; i < num_coefs; i++ ) quant_coefs[i] = dct[zigzag[i]]; } } else { if( num_coefs == 64 ) { h->mc.memcpy_aligned( orig_coefs, dct, sizeof(dctcoef)*64 ); if( !h->quantf.quant_8x8( dct, quant_mf, quant_bias ) ) return 0; h->zigzagf.scan_8x8( quant_coefs, dct ); } else //if( num_coefs == 16 ) { memcpy( orig_coefs, dct, sizeof(dctcoef)*16 ); if( !h->quantf.quant_4x4( dct, quant_mf, quant_bias ) ) return 0; h->zigzagf.scan_4x4( quant_coefs, dct ); } } int last_nnz = h->quantf.coeff_last[ctx_block_cat]( quant_coefs+b_ac )+b_ac; uint8_t *cabac_state = &h->cabac.state[ x264_coeff_abs_level_m1_offset[ctx_block_cat] ]; /* shortcut for dc-only blocks. * this doesn't affect the output, but saves some unnecessary computation. 
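     * (last_nnz == 0 means the only nonzero quantized coefficient sits at scan position 0,
     * so the full trellis collapses to trying q and q-1 on that single coefficient,
     * which trellis_dc_shortcut() does directly.)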
*/ if( last_nnz == 0 && !dc ) { int cost_sig = x264_cabac_size_decision_noup2( &cabac_state_sig[0], 1 ) + x264_cabac_size_decision_noup2( &cabac_state_last[0], 1 ); dct[0] = trellis_dc_shortcut( orig_coefs[0], quant_coefs[0], unquant_mf[0], coef_weight2[0], lambda2, cabac_state, cost_sig ); return !!dct[0]; } #if HAVE_MMX && ARCH_X86_64 uint64_t level_state0; memcpy( &level_state0, cabac_state, sizeof(uint64_t) ); uint16_t level_state1; memcpy( &level_state1, cabac_state+8, sizeof(uint16_t) ); #define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\ cabac_state_sig, cabac_state_last, level_state0, level_state1 if( num_coefs == 16 && !dc ) if( b_chroma || !h->mb.i_psy_trellis ) return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac ); else return h->quantf.trellis_cabac_4x4_psy( TRELLIS_ARGS, b_ac, h->mb.pic.fenc_dct4[idx&15], h->mb.i_psy_trellis ); else if( num_coefs == 64 && !dc ) if( b_chroma || !h->mb.i_psy_trellis ) return h->quantf.trellis_cabac_8x8( TRELLIS_ARGS, b_interlaced ); else return h->quantf.trellis_cabac_8x8_psy( TRELLIS_ARGS, b_interlaced, h->mb.pic.fenc_dct8[idx&3], h->mb.i_psy_trellis); else if( num_coefs == 8 && dc ) return h->quantf.trellis_cabac_chroma_422_dc( TRELLIS_ARGS ); else if( dc ) return h->quantf.trellis_cabac_dc( TRELLIS_ARGS, num_coefs-1 ); #endif // (# of coefs) * (# of ctx) * (# of levels tried) = 1024 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough, // but it takes more time to remove dead states than you gain in reduced memory. trellis_level_t level_tree[64*8*2]; int levels_used = 1; /* init trellis */ trellis_node_t nodes[2][8] = {0}; trellis_node_t *nodes_cur = nodes[0]; trellis_node_t *nodes_prev = nodes[1]; trellis_node_t *bnode; for( int j = 1; j < 8; j++ ) nodes_cur[j].score = TRELLIS_SCORE_MAX; nodes_cur[0].score = TRELLIS_SCORE_BIAS; nodes_cur[0].level_idx = 0; level_tree[0].abs_level = 0; level_tree[0].next = 0; ALIGNED_4( uint8_t level_state[16] ); memcpy( level_state, cabac_state, 10 ); level_state[12] = cabac_state[0]; // packed subset for copying into trellis_node_t level_state[13] = cabac_state[4]; level_state[14] = cabac_state[8]; level_state[15] = cabac_state[9]; idx &= num_coefs == 64 ? 3 : 15; // coefs are processed in reverse order, because that's how the abs value is coded. // last_coef and significant_coef flags are normally coded in forward order, but // we have to reverse them to match the levels. // in 4x4 blocks, last_coef and significant_coef use a separate context for each // position, so the order doesn't matter, and we don't even have to update their contexts. // in 8x8 blocks, some positions share contexts, so we'll just have to hope that // cabac isn't too sensitive. int i = last_nnz; #define TRELLIS_LOOP(ctx_hi)\ for( ; i >= b_ac; i-- )\ {\ /* skip 0s: this doesn't affect the output, but saves some unnecessary computation. */\ if( !quant_coefs[i] )\ {\ /* no need to calculate ssd of 0s: it's the same in all nodes.\ * no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.\ * subtracting from one score is equivalent to adding to the rest. */\ if( !ctx_hi )\ {\ int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\ b_chroma && dc && num_coefs == 8 ? 
x264_coeff_flag_offset_chroma_422_dc[i] : i;\ uint64_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )\ * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\ nodes_cur[0].score -= cost_sig0;\ }\ for( int j = 1; j < (ctx_hi?8:4); j++ )\ SET_LEVEL( nodes_cur[j], nodes_cur[j], 0 );\ continue;\ }\ \ int sign_coef = orig_coefs[zigzag[i]];\ int abs_coef = abs( sign_coef );\ int q = abs( quant_coefs[i] );\ int cost_siglast[3]; /* { zero, nonzero, nonzero-and-last } */\ XCHG( trellis_node_t*, nodes_cur, nodes_prev );\ for( int j = ctx_hi; j < 8; j++ )\ nodes_cur[j].score = TRELLIS_SCORE_MAX;\ \ if( i < num_coefs-1 || ctx_hi )\ {\ int sigindex = !dc && num_coefs == 64 ? x264_significant_coeff_flag_offset_8x8[b_interlaced][i] :\ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\ int lastindex = !dc && num_coefs == 64 ? x264_last_coeff_flag_offset_8x8[i] :\ b_chroma && dc && num_coefs == 8 ? x264_coeff_flag_offset_chroma_422_dc[i] : i;\ cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );\ int cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );\ cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;\ if( !ctx_hi )\ cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;\ }\ else\ {\ cost_siglast[0] = cost_siglast[1] = cost_siglast[2] = 0;\ }\ \ /* there are a few cases where increasing the coeff magnitude helps,\ * but it's only around .003 dB, and skipping them ~doubles the speed of trellis.\ * could also try q-2: that sometimes helps, but also sometimes decimates blocks\ * that are better left coded, especially at QP > 40. */\ uint64_t ssd0[2], ssd1[2];\ for( int k = 0; k < 2; k++ )\ {\ int abs_level = q-1+k;\ int unquant_abs_level = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[i]]) * abs_level + 128) >> 8);\ int d = abs_coef - unquant_abs_level;\ /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */\ if( h->mb.i_psy_trellis && i && !dc && !b_chroma )\ {\ int orig_coef = (num_coefs == 64) ? h->mb.pic.fenc_dct8[idx][zigzag[i]] : h->mb.pic.fenc_dct4[idx][zigzag[i]];\ int predicted_coef = orig_coef - sign_coef;\ int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));\ int psy_weight = coef_weight1[zigzag[i]] * h->mb.i_psy_trellis;\ int64_t tmp = (int64_t)d*d * coef_weight2[zigzag[i]] - (int64_t)psy_weight * psy_value;\ ssd1[k] = (uint64_t)tmp;\ }\ else\ /* FIXME: for i16x16 dc is this weight optimal? */\ ssd1[k] = (int64_t)d*d * (dc?256:coef_weight2[zigzag[i]]);\ ssd0[k] = ssd1[k];\ if( !i && !dc && !ctx_hi )\ {\ /* Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks. */\ d = sign_coef - ((SIGN(unquant_abs_level, sign_coef) + 8)&~15);\ ssd0[k] = (int64_t)d*d * coef_weight2[zigzag[i]];\ }\ }\ \ /* argument passing imposes some significant overhead here. gcc's interprocedural register allocation isn't up to it. 
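         * Each coefficient only ever tries the two candidates q-1 and q, so the dispatch\
         * below needs just three cases: q==1 pairs level 0 with level 1, q==2 pairs\
         * level 1 with level 2, and anything larger pairs q-1 with q.\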
*/\ switch( q )\ {\ case 1:\ ssd1[0] += (uint64_t)cost_siglast[0] * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );\ levels_used = trellis_coef0_##ctx_hi( ssd0[0]-ssd1[0], nodes_cur, nodes_prev, level_tree, levels_used );\ levels_used = trellis_coef1_##ctx_hi( ssd0[1]-ssd1[0], ssd1[1]-ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\ goto next##ctx_hi;\ case 2:\ levels_used = trellis_coef1_##ctx_hi( ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state );\ levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\ goto next1;\ default:\ levels_used = trellis_coefn_##ctx_hi( q-1, ssd0[0], ssd1[0], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\ levels_used = trellis_coefn_##ctx_hi( q, ssd0[1], ssd1[1], cost_siglast, nodes_cur, nodes_prev, level_tree, levels_used, lambda2, level_state, levelgt1_ctx );\ goto next1;\ }\ next##ctx_hi:;\ }\ /* output levels from the best path through the trellis */\ bnode = &nodes_cur[ctx_hi];\ for( int j = ctx_hi+1; j < (ctx_hi?8:4); j++ )\ if( nodes_cur[j].score < bnode->score )\ bnode = &nodes_cur[j]; // keep 2 versions of the main quantization loop, depending on which subsets of the node_ctxs are live // node_ctx 0..3, i.e. having not yet encountered any coefs that might be quantized to >1 TRELLIS_LOOP(0); if( bnode == &nodes_cur[0] ) { /* We only need to zero an empty 4x4 block. 8x8 can be implicitly emptied via zero nnz, as can dc. */ if( num_coefs == 16 && !dc ) memset( dct, 0, 16 * sizeof(dctcoef) ); return 0; } if( 0 ) // accessible only by goto, not fallthrough { // node_ctx 1..7 (ctx0 ruled out because we never try both level0 and level2+ on the same coef) TRELLIS_LOOP(1); } int level = bnode->level_idx; for( i = b_ac; i <= last_nnz; i++ ) { dct[zigzag[i]] = SIGN(level_tree[level].abs_level, dct[zigzag[i]]); level = level_tree[level].next; } return 1; } /* FIXME: This is a gigantic hack. See below. * * CAVLC is much more difficult to trellis than CABAC. * * CABAC has only three states to track: significance map, last, and the * level state machine. * CAVLC, by comparison, has five: coeff_token (trailing + total), * total_zeroes, zero_run, and the level state machine. * * I know of no paper that has managed to design a close-to-optimal trellis * that covers all five of these and isn't exponential-time. As a result, this * "trellis" isn't: it's just a QNS search. Patches welcome for something better. * It's actually surprisingly fast, albeit not quite optimal. It's pretty close * though; since CAVLC only has 2^16 possible rounding modes (assuming only two * roundings as options), a bruteforce search is feasible. Testing shows * that this QNS is reasonably close to optimal in terms of compression. * * TODO: * Don't bother changing large coefficients when it wouldn't affect bit cost * (e.g. only affecting bypassed suffix bits). * Don't re-run all parts of CAVLC bit cost calculation when not necessary. * e.g. when changing a coefficient from one non-zero value to another in * such a way that trailing ones and suffix length isn't affected. 
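 * Concretely, the loop below starts from a deadzone-style initial rounding, then
 * repeatedly tries flipping one coefficient at a time between its two candidate
 * roundings (nearest and nearest-1), re-counting the CAVLC bits each time, and keeps
 * whichever single flip lowers rate+distortion the most; it stops when no flip helps.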
*/ static ALWAYS_INLINE int quant_trellis_cavlc( x264_t *h, dctcoef *dct, const udctcoef *quant_mf, const int *unquant_mf, const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx, int b_8x8 ) { ALIGNED_ARRAY_16( dctcoef, quant_coefs,[2],[16] ); ALIGNED_ARRAY_16( dctcoef, coefs,[16] ); const uint32_t *coef_weight1 = b_8x8 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = b_8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; int64_t delta_distortion[16]; int64_t score = 1ULL<<62; int i, j; const int f = 1<<15; int nC = b_chroma && dc ? 3 + (num_coefs>>2) : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )]; for( i = 0; i < 16; i += 16/sizeof(*coefs) ) M128( &coefs[i] ) = M128_ZERO; /* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different * step/start/end than internal processing. */ int step = 1; int start = b_ac; int end = num_coefs - 1; if( b_8x8 ) { start = idx&3; end = 60 + start; step = 4; } idx &= 15; lambda2 <<= LAMBDA_BITS; /* Find last non-zero coefficient. */ for( i = end; i >= start; i -= step ) if( abs(dct[zigzag[i]]) * (dc?quant_mf[0]>>1:quant_mf[zigzag[i]]) >= f ) break; if( i < start ) goto zeroblock; /* Prepare for QNS search: calculate distortion caused by each DCT coefficient * rounding to be searched. * * We only search two roundings (nearest and nearest-1) like in CABAC trellis, * so we just store the difference in distortion between them. */ int last_nnz = b_8x8 ? i >> 2 : i; int coef_mask = 0; int round_mask = 0; for( i = b_ac, j = start; i <= last_nnz; i++, j += step ) { int coef = dct[zigzag[j]]; int abs_coef = abs(coef); int sign = coef < 0 ? -1 : 1; int nearest_quant = ( f + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16; quant_coefs[1][i] = quant_coefs[0][i] = sign * nearest_quant; coefs[i] = quant_coefs[1][i]; if( nearest_quant ) { /* We initialize the trellis with a deadzone halfway between nearest rounding * and always-round-down. This gives much better results than initializing to either * extreme. * FIXME: should we initialize to the deadzones used by deadzone quant? */ int deadzone_quant = ( f/2 + abs_coef * (dc?quant_mf[0]>>1:quant_mf[zigzag[j]]) ) >> 16; int unquant1 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-0) + 128) >> 8); int unquant0 = (((dc?unquant_mf[0]<<1:unquant_mf[zigzag[j]]) * (nearest_quant-1) + 128) >> 8); int d1 = abs_coef - unquant1; int d0 = abs_coef - unquant0; delta_distortion[i] = (int64_t)(d0*d0 - d1*d1) * (dc?256:coef_weight2[zigzag[j]]); /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. */ if( h->mb.i_psy_trellis && j && !dc && !b_chroma ) { int orig_coef = b_8x8 ? h->mb.pic.fenc_dct8[idx>>2][zigzag[j]] : h->mb.pic.fenc_dct4[idx][zigzag[j]]; int predicted_coef = orig_coef - coef; int psy_weight = coef_weight1[zigzag[j]]; int psy_value0 = h->mb.i_psy_trellis * abs(predicted_coef + unquant0 * sign); int psy_value1 = h->mb.i_psy_trellis * abs(predicted_coef + unquant1 * sign); delta_distortion[i] += (psy_value0 - psy_value1) * psy_weight; } quant_coefs[0][i] = sign * (nearest_quant-1); if( deadzone_quant != nearest_quant ) coefs[i] = quant_coefs[0][i]; else round_mask |= 1 << i; } else delta_distortion[i] = 0; coef_mask |= (!!coefs[i]) << i; } /* Calculate the cost of the starting state. 
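     * (The residual writer is used here purely as a bit counter: i_bits_encoded is reset
     * before the call and read back afterwards, then scaled by lambda2 -- which was
     * pre-shifted by LAMBDA_BITS above -- so rate and the distortion deltas share units.)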
*/ h->out.bs.i_bits_encoded = 0; if( !coef_mask ) bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] ); else cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC ); score = (int64_t)h->out.bs.i_bits_encoded * lambda2; /* QNS loop: pick the change that improves RD the most, apply it, repeat. * coef_mask and round_mask are used to simplify tracking of nonzeroness * and rounding modes chosen. */ while( 1 ) { int64_t iter_score = score; int64_t iter_distortion_delta = 0; int iter_coef = -1; int iter_mask = coef_mask; int iter_round = round_mask; for( i = b_ac; i <= last_nnz; i++ ) { if( !delta_distortion[i] ) continue; /* Set up all the variables for this iteration. */ int cur_round = round_mask ^ (1 << i); int round_change = (cur_round >> i)&1; int old_coef = coefs[i]; int new_coef = quant_coefs[round_change][i]; int cur_mask = (coef_mask&~(1 << i))|(!!new_coef << i); int64_t cur_distortion_delta = delta_distortion[i] * (round_change ? -1 : 1); int64_t cur_score = cur_distortion_delta; coefs[i] = new_coef; /* Count up bits. */ h->out.bs.i_bits_encoded = 0; if( !cur_mask ) bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] ); else cavlc_block_residual_internal( h, ctx_block_cat, coefs + b_ac, nC ); cur_score += (int64_t)h->out.bs.i_bits_encoded * lambda2; coefs[i] = old_coef; if( cur_score < iter_score ) { iter_score = cur_score; iter_coef = i; iter_mask = cur_mask; iter_round = cur_round; iter_distortion_delta = cur_distortion_delta; } } if( iter_coef >= 0 ) { score = iter_score - iter_distortion_delta; coef_mask = iter_mask; round_mask = iter_round; coefs[iter_coef] = quant_coefs[((round_mask >> iter_coef)&1)][iter_coef]; /* Don't try adjusting coefficients we've already adjusted. * Testing suggests this doesn't hurt results -- and sometimes actually helps. 
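     * (Zeroing delta_distortion[] is what marks a coefficient as "already adjusted":
     * the candidate loop above skips entries whose delta is zero.)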
*/ delta_distortion[iter_coef] = 0; } else break; } if( coef_mask ) { for( i = b_ac, j = start; i < num_coefs; i++, j += step ) dct[zigzag[j]] = coefs[i]; return 1; } zeroblock: if( !dc ) { if( b_8x8 ) for( i = start; i <= end; i+=step ) dct[zigzag[i]] = 0; else memset( dct, 0, 16*sizeof(dctcoef) ); } return 0; } int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx ) { if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx ); return quant_trellis_cavlc( h, dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 ); } static const uint8_t zigzag_scan2x2[4] = { 0, 1, 2, 3 }; static const uint8_t zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 }; int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx ) { const uint8_t *zigzag; int num_coefs; int quant_cat = CQM_4IC+1 - b_intra; if( CHROMA_FORMAT == CHROMA_422 ) { zigzag = zigzag_scan2x4; num_coefs = 8; } else { zigzag = zigzag_scan2x2; num_coefs = 4; } if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, h->quant4_mf[quant_cat][i_qp], h->quant4_bias0[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], zigzag, DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx ); return quant_trellis_cavlc( h, dct, h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], zigzag, DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 ); } int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ) { static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0}; int b_ac = ctx_ac[ctx_block_cat]; if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias0[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx ); return quant_trellis_cavlc( h, dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_zigzag_scan4[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], b_ac, b_chroma, 0, 16, idx, 0 ); } int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ) { if( h->param.b_cabac ) { return quant_trellis_cabac( h, dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias0[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp], x264_zigzag_scan8[MB_INTERLACED], ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 64, idx ); } /* 8x8 CAVLC is split into 4 4x4 blocks */ int nzaccum = 0; for( int i = 0; i < 4; i++ ) { int nz = quant_trellis_cavlc( h, dct, h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp], x264_zigzag_scan8[MB_INTERLACED], DCT_LUMA_4x4, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 0, 16, idx*4+i, 1 ); /* Set up nonzero count for future calls */ h->mb.cache.non_zero_count[x264_scan8[idx*4+i]] = nz; nzaccum |= nz; } STORE_8x8_NNZ( 0, idx, 0 ); return nzaccum; } 
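/* Illustrative sketch, not from the original source: as the comment above notes, CAVLC has
 * no 8x8 residual syntax, so x264_quant_8x8_trellis() trellises an 8x8 block as four
 * interleaved 4x4 blocks -- in quant_trellis_cavlc() sub-block k walks the 8x8 zigzag with
 * start = idx&3 and step = 4.  A hypothetical helper listing the scan positions one
 * sub-block owns (disabled so it doesn't affect the build): */
#if 0
static void toy_cavlc_8x8_subblock_scan( int k, int pos[16] )
{
    for( int i = 0; i < 16; i++ )
        pos[i] = k + 4*i;   // positions k, k+4, ..., k+60 of the 8x8 zigzag order
}
#endif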
x264-master/encoder/set.c000066400000000000000000001046331502133446700154440ustar00rootroot00000000000000/***************************************************************************** * set: header writing ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "set.h" #define bs_write_ue bs_write_ue_big // Indexed by pic_struct values static const uint8_t num_clock_ts[10] = { 0, 1, 1, 1, 2, 2, 3, 3, 2, 3 }; static const uint8_t avcintra_uuid[] = {0xF7, 0x49, 0x3E, 0xB3, 0xD4, 0x00, 0x47, 0x96, 0x86, 0x86, 0xC9, 0x70, 0x7B, 0x64, 0x37, 0x2A}; static void transpose( uint8_t *buf, int w ) { for( int i = 0; i < w; i++ ) for( int j = 0; j < i; j++ ) XCHG( uint8_t, buf[w*i+j], buf[w*j+i] ); } static void scaling_list_write( bs_t *s, x264_sps_t *sps, int idx ) { const int len = idx<4 ? 16 : 64; const uint8_t *zigzag = idx<4 ? x264_zigzag_scan4[0] : x264_zigzag_scan8[0]; const uint8_t *list = sps->scaling_list[idx]; const uint8_t *def_list = (idx==CQM_4IC) ? sps->scaling_list[CQM_4IY] : (idx==CQM_4PC) ? sps->scaling_list[CQM_4PY] : (idx==CQM_8IC+4) ? sps->scaling_list[CQM_8IY+4] : (idx==CQM_8PC+4) ? sps->scaling_list[CQM_8PY+4] : x264_cqm_jvt[idx]; if( !memcmp( list, def_list, len ) ) bs_write1( s, 0 ); // scaling_list_present_flag else if( !memcmp( list, x264_cqm_jvt[idx], len ) ) { bs_write1( s, 1 ); // scaling_list_present_flag bs_write_se( s, -8 ); // use jvt list } else { int run; bs_write1( s, 1 ); // scaling_list_present_flag // try run-length compression of trailing values for( run = len; run > 1; run-- ) if( list[zigzag[run-1]] != list[zigzag[run-2]] ) break; if( run < len && len - run < bs_size_se( (int8_t)-list[zigzag[run]] ) ) run = len; for( int j = 0; j < run; j++ ) bs_write_se( s, (int8_t)(list[zigzag[j]] - (j>0 ? 
list[zigzag[j-1]] : 8)) ); // delta if( run < len ) bs_write_se( s, (int8_t)-list[zigzag[run]] ); } } void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type ) { int i; bs_realign( s ); for( i = 0; i <= payload_type-255; i += 255 ) bs_write( s, 8, 255 ); bs_write( s, 8, payload_type-i ); for( i = 0; i <= payload_size-255; i += 255 ) bs_write( s, 8, 255 ); bs_write( s, 8, payload_size-i ); for( i = 0; i < payload_size; i++ ) bs_write( s, 8, payload[i] ); bs_rbsp_trailing( s ); bs_flush( s ); } void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) { int csp = param->i_csp & X264_CSP_MASK; sps->i_id = i_id; sps->i_mb_width = ( param->i_width + 15 ) / 16; sps->i_mb_height= ( param->i_height + 15 ) / 16; sps->b_frame_mbs_only = !(param->b_interlaced || param->b_fake_interlaced); if( !sps->b_frame_mbs_only ) sps->i_mb_height = ( sps->i_mb_height + 1 ) & ~1; sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 : csp >= X264_CSP_I422 ? CHROMA_422 : csp >= X264_CSP_I420 ? CHROMA_420 : CHROMA_400; sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0; if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 ) sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE; else if( sps->i_chroma_format_idc == CHROMA_422 ) sps->i_profile_idc = PROFILE_HIGH422; else if( BIT_DEPTH > 8 ) sps->i_profile_idc = PROFILE_HIGH10; else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT || sps->i_chroma_format_idc == CHROMA_400 ) sps->i_profile_idc = PROFILE_HIGH; else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 ) sps->i_profile_idc = PROFILE_MAIN; else sps->i_profile_idc = PROFILE_BASELINE; sps->b_constraint_set0 = sps->i_profile_idc == PROFILE_BASELINE; /* x264 doesn't support the features that are in Baseline and not in Main, * namely arbitrary_slice_order and slice_groups. */ sps->b_constraint_set1 = sps->i_profile_idc <= PROFILE_MAIN; /* Never set constraint_set2, it is not necessary and not used in real world. */ sps->b_constraint_set2 = 0; sps->b_constraint_set3 = 0; sps->i_level_idc = param->i_level_idc; if( param->i_level_idc == 9 && ( sps->i_profile_idc == PROFILE_BASELINE || sps->i_profile_idc == PROFILE_MAIN ) ) { sps->b_constraint_set3 = 1; /* level 1b with Baseline or Main profile is signalled via constraint_set3 */ sps->i_level_idc = 11; } /* Intra profiles */ if( param->i_keyint_max == 1 && sps->i_profile_idc >= PROFILE_HIGH ) sps->b_constraint_set3 = 1; sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0; /* extra slot with pyramid so that we don't have to override the * order of forgetting old pictures */ sps->vui.i_max_dec_frame_buffering = sps->i_num_ref_frames = X264_MIN(X264_REF_MAX, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames, param->i_bframe_pyramid ? 
4 : 1, param->i_dpb_size)); sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT; if( param->i_keyint_max == 1 ) { sps->i_num_ref_frames = 0; sps->vui.i_max_dec_frame_buffering = 0; } /* number of refs + current frame */ int max_frame_num = sps->vui.i_max_dec_frame_buffering * (!!param->i_bframe_pyramid+1) + 1; /* Intra refresh cannot write a recovery time greater than max frame num-1 */ if( param->b_intra_refresh ) { int time_to_recovery = X264_MIN( sps->i_mb_width - 1, param->i_keyint_max ) + param->i_bframe - 1; max_frame_num = X264_MAX( max_frame_num, time_to_recovery+1 ); } sps->i_log2_max_frame_num = 4; while( (1 << sps->i_log2_max_frame_num) <= max_frame_num ) sps->i_log2_max_frame_num++; sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2; if( sps->i_poc_type == 0 ) { int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2; sps->i_log2_max_poc_lsb = 4; while( (1 << sps->i_log2_max_poc_lsb) <= max_delta_poc * 2 ) sps->i_log2_max_poc_lsb++; } sps->b_vui = 1; sps->b_gaps_in_frame_num_value_allowed = 0; sps->b_mb_adaptive_frame_field = param->b_interlaced; sps->b_direct8x8_inference = 1; x264_sps_init_reconfigurable( sps, param ); sps->vui.b_overscan_info_present = param->vui.i_overscan > 0 && param->vui.i_overscan <= 2; if( sps->vui.b_overscan_info_present ) sps->vui.b_overscan_info = ( param->vui.i_overscan == 2 ? 1 : 0 ); sps->vui.b_signal_type_present = 0; sps->vui.i_vidformat = ( param->vui.i_vidformat >= 0 && param->vui.i_vidformat <= 5 ? param->vui.i_vidformat : 5 ); sps->vui.b_fullrange = ( param->vui.b_fullrange >= 0 && param->vui.b_fullrange <= 1 ? param->vui.b_fullrange : ( csp >= X264_CSP_BGR ? 1 : 0 ) ); sps->vui.b_color_description_present = 0; sps->vui.i_colorprim = ( param->vui.i_colorprim >= 0 && param->vui.i_colorprim <= 12 ? param->vui.i_colorprim : 2 ); sps->vui.i_transfer = ( param->vui.i_transfer >= 0 && param->vui.i_transfer <= 18 ? param->vui.i_transfer : 2 ); sps->vui.i_colmatrix = ( param->vui.i_colmatrix >= 0 && param->vui.i_colmatrix <= 14 ? param->vui.i_colmatrix : ( csp >= X264_CSP_BGR ? 
0 : 2 ) ); if( sps->vui.i_colorprim != 2 || sps->vui.i_transfer != 2 || sps->vui.i_colmatrix != 2 ) sps->vui.b_color_description_present = 1; if( sps->vui.i_vidformat != 5 || sps->vui.b_fullrange || sps->vui.b_color_description_present ) sps->vui.b_signal_type_present = 1; /* FIXME: not sufficient for interlaced video */ sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5 && sps->i_chroma_format_idc == CHROMA_420; if( sps->vui.b_chroma_loc_info_present ) { sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc; sps->vui.i_chroma_loc_bottom = param->vui.i_chroma_loc; } sps->vui.b_timing_info_present = param->i_timebase_num > 0 && param->i_timebase_den > 0; if( sps->vui.b_timing_info_present ) { sps->vui.i_num_units_in_tick = param->i_timebase_num; sps->vui.i_time_scale = param->i_timebase_den * 2; sps->vui.b_fixed_frame_rate = !param->b_vfr_input; } sps->vui.b_vcl_hrd_parameters_present = 0; // we don't support VCL HRD sps->vui.b_nal_hrd_parameters_present = !!param->i_nal_hrd; sps->vui.b_pic_struct_present = param->b_pic_struct; // NOTE: HRD related parts of the SPS are initialised in x264_ratecontrol_init_reconfigurable sps->vui.b_bitstream_restriction = !(sps->b_constraint_set3 && sps->i_profile_idc >= PROFILE_HIGH); if( sps->vui.b_bitstream_restriction ) { sps->vui.b_motion_vectors_over_pic_boundaries = 1; sps->vui.i_max_bytes_per_pic_denom = 0; sps->vui.i_max_bits_per_mb_denom = 0; sps->vui.i_log2_max_mv_length_horizontal = sps->vui.i_log2_max_mv_length_vertical = (int)log2f( X264_MAX( 1, param->analyse.i_mv_range*4-1 ) ) + 1; } sps->b_avcintra_hd = param->i_avcintra_class && param->i_avcintra_class <= 200; sps->b_avcintra_4k = param->i_avcintra_class > 200; sps->i_cqm_preset = param->i_cqm_preset; } void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param ) { sps->crop.i_left = param->crop_rect.i_left; sps->crop.i_top = param->crop_rect.i_top; sps->crop.i_right = param->crop_rect.i_right + sps->i_mb_width*16 - param->i_width; sps->crop.i_bottom = param->crop_rect.i_bottom + sps->i_mb_height*16 - param->i_height; sps->b_crop = sps->crop.i_left || sps->crop.i_top || sps->crop.i_right || sps->crop.i_bottom; sps->vui.b_aspect_ratio_info_present = 0; if( param->vui.i_sar_width > 0 && param->vui.i_sar_height > 0 ) { sps->vui.b_aspect_ratio_info_present = 1; sps->vui.i_sar_width = param->vui.i_sar_width; sps->vui.i_sar_height= param->vui.i_sar_height; } } void x264_sps_init_scaling_list( x264_sps_t *sps, x264_param_t *param ) { switch( sps->i_cqm_preset ) { case X264_CQM_FLAT: for( int i = 0; i < 8; i++ ) sps->scaling_list[i] = x264_cqm_flat16; break; case X264_CQM_JVT: for( int i = 0; i < 8; i++ ) sps->scaling_list[i] = x264_cqm_jvt[i]; break; case X264_CQM_CUSTOM: /* match the transposed DCT & zigzag */ transpose( param->cqm_4iy, 4 ); transpose( param->cqm_4py, 4 ); transpose( param->cqm_4ic, 4 ); transpose( param->cqm_4pc, 4 ); transpose( param->cqm_8iy, 8 ); transpose( param->cqm_8py, 8 ); transpose( param->cqm_8ic, 8 ); transpose( param->cqm_8pc, 8 ); sps->scaling_list[CQM_4IY] = param->cqm_4iy; sps->scaling_list[CQM_4PY] = param->cqm_4py; sps->scaling_list[CQM_4IC] = param->cqm_4ic; sps->scaling_list[CQM_4PC] = param->cqm_4pc; sps->scaling_list[CQM_8IY+4] = param->cqm_8iy; sps->scaling_list[CQM_8PY+4] = param->cqm_8py; sps->scaling_list[CQM_8IC+4] = param->cqm_8ic; sps->scaling_list[CQM_8PC+4] = param->cqm_8pc; for( int i = 0; i < 8; i++ ) for( int j = 0; j < (i < 4 ? 
16 : 64); j++ ) if( sps->scaling_list[i][j] == 0 ) sps->scaling_list[i] = x264_cqm_jvt[i]; break; } } void x264_sps_write( bs_t *s, x264_sps_t *sps ) { bs_realign( s ); bs_write( s, 8, sps->i_profile_idc ); bs_write1( s, sps->b_constraint_set0 ); bs_write1( s, sps->b_constraint_set1 ); bs_write1( s, sps->b_constraint_set2 ); bs_write1( s, sps->b_constraint_set3 ); bs_write( s, 4, 0 ); /* reserved */ bs_write( s, 8, sps->i_level_idc ); bs_write_ue( s, sps->i_id ); if( sps->i_profile_idc >= PROFILE_HIGH ) { bs_write_ue( s, sps->i_chroma_format_idc ); if( sps->i_chroma_format_idc == CHROMA_444 ) bs_write1( s, 0 ); // separate_colour_plane_flag bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8 bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8 bs_write1( s, sps->b_qpprime_y_zero_transform_bypass ); /* Exactly match the AVC-Intra bitstream */ bs_write1( s, sps->b_avcintra_hd ); // seq_scaling_matrix_present_flag if( sps->b_avcintra_hd ) { scaling_list_write( s, sps, CQM_4IY ); scaling_list_write( s, sps, CQM_4IC ); scaling_list_write( s, sps, CQM_4IC ); bs_write1( s, 0 ); // no inter bs_write1( s, 0 ); // no inter bs_write1( s, 0 ); // no inter scaling_list_write( s, sps, CQM_8IY+4 ); bs_write1( s, 0 ); // no inter if( sps->i_chroma_format_idc == CHROMA_444 ) { scaling_list_write( s, sps, CQM_8IC+4 ); bs_write1( s, 0 ); // no inter scaling_list_write( s, sps, CQM_8IC+4 ); bs_write1( s, 0 ); // no inter } } } bs_write_ue( s, sps->i_log2_max_frame_num - 4 ); bs_write_ue( s, sps->i_poc_type ); if( sps->i_poc_type == 0 ) bs_write_ue( s, sps->i_log2_max_poc_lsb - 4 ); bs_write_ue( s, sps->i_num_ref_frames ); bs_write1( s, sps->b_gaps_in_frame_num_value_allowed ); bs_write_ue( s, sps->i_mb_width - 1 ); bs_write_ue( s, (sps->i_mb_height >> !sps->b_frame_mbs_only) - 1); bs_write1( s, sps->b_frame_mbs_only ); if( !sps->b_frame_mbs_only ) bs_write1( s, sps->b_mb_adaptive_frame_field ); bs_write1( s, sps->b_direct8x8_inference ); bs_write1( s, sps->b_crop ); if( sps->b_crop ) { int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422; int v_shift = (sps->i_chroma_format_idc == CHROMA_420) + !sps->b_frame_mbs_only; bs_write_ue( s, sps->crop.i_left >> h_shift ); bs_write_ue( s, sps->crop.i_right >> h_shift ); bs_write_ue( s, sps->crop.i_top >> v_shift ); bs_write_ue( s, sps->crop.i_bottom >> v_shift ); } bs_write1( s, sps->b_vui ); if( sps->b_vui ) { bs_write1( s, sps->vui.b_aspect_ratio_info_present ); if( sps->vui.b_aspect_ratio_info_present ) { int i; static const struct { uint8_t w, h, sar; } sar[] = { // aspect_ratio_idc = 0 -> unspecified { 1, 1, 1 }, { 12, 11, 2 }, { 10, 11, 3 }, { 16, 11, 4 }, { 40, 33, 5 }, { 24, 11, 6 }, { 20, 11, 7 }, { 32, 11, 8 }, { 80, 33, 9 }, { 18, 11, 10}, { 15, 11, 11}, { 64, 33, 12}, {160, 99, 13}, { 4, 3, 14}, { 3, 2, 15}, { 2, 1, 16}, // aspect_ratio_idc = [17..254] -> reserved { 0, 0, 255 } }; for( i = 0; sar[i].sar != 255; i++ ) { if( sar[i].w == sps->vui.i_sar_width && sar[i].h == sps->vui.i_sar_height ) break; } bs_write( s, 8, sar[i].sar ); if( sar[i].sar == 255 ) /* aspect_ratio_idc (extended) */ { bs_write( s, 16, sps->vui.i_sar_width ); bs_write( s, 16, sps->vui.i_sar_height ); } } bs_write1( s, sps->vui.b_overscan_info_present ); if( sps->vui.b_overscan_info_present ) bs_write1( s, sps->vui.b_overscan_info ); bs_write1( s, sps->vui.b_signal_type_present ); if( sps->vui.b_signal_type_present ) { bs_write( s, 3, sps->vui.i_vidformat ); bs_write1( s, sps->vui.b_fullrange ); bs_write1( s, 
sps->vui.b_color_description_present ); if( sps->vui.b_color_description_present ) { bs_write( s, 8, sps->vui.i_colorprim ); bs_write( s, 8, sps->vui.i_transfer ); bs_write( s, 8, sps->vui.i_colmatrix ); } } bs_write1( s, sps->vui.b_chroma_loc_info_present ); if( sps->vui.b_chroma_loc_info_present ) { bs_write_ue( s, sps->vui.i_chroma_loc_top ); bs_write_ue( s, sps->vui.i_chroma_loc_bottom ); } bs_write1( s, sps->vui.b_timing_info_present ); if( sps->vui.b_timing_info_present ) { bs_write32( s, sps->vui.i_num_units_in_tick ); bs_write32( s, sps->vui.i_time_scale ); bs_write1( s, sps->vui.b_fixed_frame_rate ); } bs_write1( s, sps->vui.b_nal_hrd_parameters_present ); if( sps->vui.b_nal_hrd_parameters_present ) { bs_write_ue( s, sps->vui.hrd.i_cpb_cnt - 1 ); bs_write( s, 4, sps->vui.hrd.i_bit_rate_scale ); bs_write( s, 4, sps->vui.hrd.i_cpb_size_scale ); bs_write_ue( s, sps->vui.hrd.i_bit_rate_value - 1 ); bs_write_ue( s, sps->vui.hrd.i_cpb_size_value - 1 ); bs_write1( s, sps->vui.hrd.b_cbr_hrd ); bs_write( s, 5, sps->vui.hrd.i_initial_cpb_removal_delay_length - 1 ); bs_write( s, 5, sps->vui.hrd.i_cpb_removal_delay_length - 1 ); bs_write( s, 5, sps->vui.hrd.i_dpb_output_delay_length - 1 ); bs_write( s, 5, sps->vui.hrd.i_time_offset_length ); } bs_write1( s, sps->vui.b_vcl_hrd_parameters_present ); if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present ) bs_write1( s, 0 ); /* low_delay_hrd_flag */ bs_write1( s, sps->vui.b_pic_struct_present ); bs_write1( s, sps->vui.b_bitstream_restriction ); if( sps->vui.b_bitstream_restriction ) { bs_write1( s, sps->vui.b_motion_vectors_over_pic_boundaries ); bs_write_ue( s, sps->vui.i_max_bytes_per_pic_denom ); bs_write_ue( s, sps->vui.i_max_bits_per_mb_denom ); bs_write_ue( s, sps->vui.i_log2_max_mv_length_horizontal ); bs_write_ue( s, sps->vui.i_log2_max_mv_length_vertical ); bs_write_ue( s, sps->vui.i_num_reorder_frames ); bs_write_ue( s, sps->vui.i_max_dec_frame_buffering ); } } bs_rbsp_trailing( s ); bs_flush( s ); } void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps ) { pps->i_id = i_id; pps->i_sps_id = sps->i_id; pps->b_cabac = param->b_cabac; pps->b_pic_order = !param->i_avcintra_class && param->b_interlaced; pps->i_num_slice_groups = 1; pps->i_num_ref_idx_l0_default_active = param->i_frame_reference; pps->i_num_ref_idx_l1_default_active = 1; pps->b_weighted_pred = param->analyse.i_weighted_pred > 0; pps->b_weighted_bipred = param->analyse.b_weighted_bipred ? 2 : 0; pps->i_pic_init_qp = param->rc.i_rc_method == X264_RC_ABR || param->b_stitchable ? 26 + QP_BD_OFFSET : SPEC_QP( param->rc.i_qp_constant ); pps->i_pic_init_qs = 26 + QP_BD_OFFSET; pps->i_chroma_qp_index_offset = param->analyse.i_chroma_qp_offset; pps->b_deblocking_filter_control = 1; pps->b_constrained_intra_pred = param->b_constrained_intra; pps->b_redundant_pic_cnt = 0; pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 
1 : 0; } void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps ) { bs_realign( s ); bs_write_ue( s, pps->i_id ); bs_write_ue( s, pps->i_sps_id ); bs_write1( s, pps->b_cabac ); bs_write1( s, pps->b_pic_order ); bs_write_ue( s, pps->i_num_slice_groups - 1 ); bs_write_ue( s, pps->i_num_ref_idx_l0_default_active - 1 ); bs_write_ue( s, pps->i_num_ref_idx_l1_default_active - 1 ); bs_write1( s, pps->b_weighted_pred ); bs_write( s, 2, pps->b_weighted_bipred ); bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET ); bs_write_se( s, pps->i_pic_init_qs - 26 - QP_BD_OFFSET ); bs_write_se( s, pps->i_chroma_qp_index_offset ); bs_write1( s, pps->b_deblocking_filter_control ); bs_write1( s, pps->b_constrained_intra_pred ); bs_write1( s, pps->b_redundant_pic_cnt ); int b_scaling_list = !sps->b_avcintra_hd && sps->i_cqm_preset != X264_CQM_FLAT; if( pps->b_transform_8x8_mode || b_scaling_list ) { bs_write1( s, pps->b_transform_8x8_mode ); bs_write1( s, b_scaling_list ); if( b_scaling_list ) { scaling_list_write( s, sps, CQM_4IY ); scaling_list_write( s, sps, CQM_4IC ); if( sps->b_avcintra_4k ) { scaling_list_write( s, sps, CQM_4IC ); bs_write1( s, 0 ); // no inter bs_write1( s, 0 ); // no inter bs_write1( s, 0 ); // no inter } else { bs_write1( s, 0 ); // Cr = Cb scaling_list_write( s, sps, CQM_4PY ); scaling_list_write( s, sps, CQM_4PC ); bs_write1( s, 0 ); // Cr = Cb } if( pps->b_transform_8x8_mode ) { scaling_list_write( s, sps, CQM_8IY+4 ); if( sps->b_avcintra_4k ) bs_write1( s, 0 ); // no inter else scaling_list_write( s, sps, CQM_8PY+4 ); if( sps->i_chroma_format_idc == CHROMA_444 ) { scaling_list_write( s, sps, CQM_8IC+4 ); scaling_list_write( s, sps, CQM_8PC+4 ); bs_write1( s, 0 ); // Cr = Cb bs_write1( s, 0 ); // Cr = Cb } } } bs_write_se( s, pps->i_chroma_qp_index_offset ); } bs_rbsp_trailing( s ); bs_flush( s ); } void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt ) { bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); bs_write_ue( &q, recovery_frame_cnt ); // recovery_frame_cnt bs_write1( &q, 1 ); //exact_match_flag 1 bs_write1( &q, 0 ); //broken_link_flag 0 bs_write( &q, 2, 0 ); //changing_slice_group 0 bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_RECOVERY_POINT ); } int x264_sei_version_write( x264_t *h, bs_t *s ) { // random ID number generated according to ISO-11578 static const uint8_t uuid[16] = { 0xdc, 0x45, 0xe9, 0xbd, 0xe6, 0xd9, 0x48, 0xb7, 0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef }; char *opts = x264_param2string( &h->param, 0 ); char *payload; int length; if( !opts ) return -1; CHECKED_MALLOC( payload, 200 + strlen( opts ) ); memcpy( payload, uuid, 16 ); sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - " "Copy%s 2003-2025 - http://www.videolan.org/x264.html - options: %s", X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts ); length = strlen(payload)+1; x264_sei_write( s, (uint8_t *)payload, length, SEI_USER_DATA_UNREGISTERED ); x264_free( opts ); x264_free( payload ); return 0; fail: x264_free( opts ); return -1; } void x264_sei_buffering_period_write( x264_t *h, bs_t *s ) { x264_sps_t *sps = h->sps; bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); bs_write_ue( &q, sps->i_id ); if( sps->vui.b_nal_hrd_parameters_present ) { bs_write( &q, sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay ); bs_write( &q, 
sps->vui.hrd.i_initial_cpb_removal_delay_length, h->initial_cpb_removal_delay_offset ); } bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_BUFFERING_PERIOD ); } void x264_sei_pic_timing_write( x264_t *h, bs_t *s ) { x264_sps_t *sps = h->sps; bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); if( sps->vui.b_nal_hrd_parameters_present || sps->vui.b_vcl_hrd_parameters_present ) { bs_write( &q, sps->vui.hrd.i_cpb_removal_delay_length, h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset ); bs_write( &q, sps->vui.hrd.i_dpb_output_delay_length, h->fenc->i_dpb_output_delay ); } if( sps->vui.b_pic_struct_present ) { bs_write( &q, 4, h->fenc->i_pic_struct-1 ); // We use index 0 for "Auto" // These clock timestamps are not standardised so we don't set them // They could be time of origin, capture or alternative ideal display for( int i = 0; i < num_clock_ts[h->fenc->i_pic_struct]; i++ ) bs_write1( &q, 0 ); // clock_timestamp_flag } bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_PIC_TIMING ); } void x264_sei_frame_packing_write( x264_t *h, bs_t *s ) { int quincunx_sampling_flag = h->param.i_frame_packing == 0; bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); bs_write_ue( &q, 0 ); // frame_packing_arrangement_id bs_write1( &q, 0 ); // frame_packing_arrangement_cancel_flag bs_write ( &q, 7, h->param.i_frame_packing ); // frame_packing_arrangement_type bs_write1( &q, quincunx_sampling_flag ); // quincunx_sampling_flag // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right bs_write ( &q, 6, h->param.i_frame_packing != 6 ); // content_interpretation_type bs_write1( &q, 0 ); // spatial_flipping_flag bs_write1( &q, 0 ); // frame0_flipped_flag bs_write1( &q, 0 ); // field_views_flag bs_write1( &q, h->param.i_frame_packing == 5 && !(h->fenc->i_frame&1) ); // current_frame_is_frame0_flag bs_write1( &q, 0 ); // frame0_self_contained_flag bs_write1( &q, 0 ); // frame1_self_contained_flag if( quincunx_sampling_flag == 0 && h->param.i_frame_packing != 5 ) { bs_write( &q, 4, 0 ); // frame0_grid_position_x bs_write( &q, 4, 0 ); // frame0_grid_position_y bs_write( &q, 4, 0 ); // frame1_grid_position_x bs_write( &q, 4, 0 ); // frame1_grid_position_y } bs_write( &q, 8, 0 ); // frame_packing_arrangement_reserved_byte // "frame_packing_arrangement_repetition_period equal to 1 specifies that the frame packing arrangement SEI message persists in output" // for (i_frame_packing == 5) this will undermine current_frame_is_frame0_flag which must alternate every view sequence bs_write_ue( &q, h->param.i_frame_packing != 5 ); // frame_packing_arrangement_repetition_period bs_write1( &q, 0 ); // frame_packing_arrangement_extension_flag bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_FRAME_PACKING ); } void x264_sei_mastering_display_write( x264_t *h, bs_t *s ) { bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); bs_write( &q, 16, h->param.mastering_display.i_green_x ); bs_write( &q, 16, h->param.mastering_display.i_green_y ); bs_write( &q, 16, h->param.mastering_display.i_blue_x ); bs_write( &q, 16, h->param.mastering_display.i_blue_y ); bs_write( &q, 16, h->param.mastering_display.i_red_x ); bs_write( &q, 16, h->param.mastering_display.i_red_y ); bs_write( &q, 16, h->param.mastering_display.i_white_x ); bs_write( &q, 16, 
h->param.mastering_display.i_white_y ); bs_write32( &q, h->param.mastering_display.i_display_max ); bs_write32( &q, h->param.mastering_display.i_display_min ); bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_MASTERING_DISPLAY ); } void x264_sei_content_light_level_write( x264_t *h, bs_t *s ) { bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); bs_write( &q, 16, h->param.content_light_level.i_max_cll ); bs_write( &q, 16, h->param.content_light_level.i_max_fall ); bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_CONTENT_LIGHT_LEVEL ); } void x264_sei_alternative_transfer_write( x264_t *h, bs_t *s ) { bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); bs_write ( &q, 8, h->param.i_alternative_transfer ); // preferred_transfer_characteristics bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_ALTERNATIVE_TRANSFER ); } void x264_filler_write( x264_t *h, bs_t *s, int filler ) { bs_realign( s ); for( int i = 0; i < filler; i++ ) bs_write( s, 8, 0xff ); bs_rbsp_trailing( s ); bs_flush( s ); } void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s ) { x264_slice_header_t *sh = &h->sh_backup; bs_t q; ALIGNED_4( uint8_t tmp_buf[100] ); M32( tmp_buf ) = 0; // shut up gcc bs_init( &q, tmp_buf, 100 ); bs_realign( &q ); /* We currently only use this for repeating B-refs, as required by Blu-ray. */ bs_write1( &q, 0 ); //original_idr_flag bs_write_ue( &q, sh->i_frame_num ); //original_frame_num if( !h->sps->b_frame_mbs_only ) bs_write1( &q, 0 ); //original_field_pic_flag bs_write1( &q, sh->i_mmco_command_count > 0 ); if( sh->i_mmco_command_count > 0 ) { for( int i = 0; i < sh->i_mmco_command_count; i++ ) { bs_write_ue( &q, 1 ); bs_write_ue( &q, sh->mmco[i].i_difference_of_pic_nums - 1 ); } bs_write_ue( &q, 0 ); } bs_align_10( &q ); x264_sei_write( s, tmp_buf, bs_pos( &q ) / 8, SEI_DEC_REF_PIC_MARKING ); } int x264_sei_avcintra_umid_write( x264_t *h, bs_t *s ) { uint8_t data[512]; const char *msg = "UMID"; const int len = 497; memset( data, 0xff, len ); memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) ); memcpy( data+16, msg, strlen(msg) ); data[20] = 0x13; /* These bytes appear to be some sort of frame/seconds counter in certain applications, * but others jump around, so leave them as zero for now */ data[22] = data[23] = data[25] = data[26] = 0; data[28] = 0x14; data[30] = data[31] = data[33] = data[34] = 0; data[36] = 0x60; data[41] = 0x22; /* Believed to be some sort of end of basic UMID identifier */ data[60] = 0x62; data[62] = data[63] = data[65] = data[66] = 0; data[68] = 0x63; data[70] = data[71] = data[73] = data[74] = 0; x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED ); return 0; } int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len ) { uint8_t data[6000]; const char *msg = "VANC"; if( len < 0 || (unsigned)len > sizeof(data) ) { x264_log( h, X264_LOG_ERROR, "AVC-Intra SEI is too large (%d)\n", len ); return -1; } memset( data, 0xff, len ); memcpy( data, avcintra_uuid, sizeof(avcintra_uuid) ); memcpy( data+16, msg, strlen(msg) ); x264_sei_write( &h->out.bs, data, len, SEI_USER_DATA_UNREGISTERED ); return 0; } #undef ERROR #define ERROR(...)\ {\ if( verbose )\ x264_log( h, X264_LOG_WARNING, __VA_ARGS__ );\ ret = 1;\ } int x264_validate_levels( x264_t *h, int verbose ) { int ret = 0; int mbs = h->sps->i_mb_width * h->sps->i_mb_height; int dpb = mbs * 
h->sps->vui.i_max_dec_frame_buffering; int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 : h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 : h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4; const x264_level_t *l = x264_levels; while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc ) l++; if( l->frame_size < mbs || l->frame_size*8 < h->sps->i_mb_width * h->sps->i_mb_width || l->frame_size*8 < h->sps->i_mb_height * h->sps->i_mb_height ) ERROR( "frame MB size (%dx%d) > level limit (%d)\n", h->sps->i_mb_width, h->sps->i_mb_height, l->frame_size ); if( dpb > l->dpb ) ERROR( "DPB size (%d frames, %d mbs) > level limit (%d frames, %d mbs)\n", h->sps->vui.i_max_dec_frame_buffering, dpb, l->dpb / mbs, l->dpb ); #define CHECK( name, limit, val ) \ if( (val) > (limit) ) \ ERROR( name " (%"PRId64") > level limit (%d)\n", (int64_t)(val), (limit) ); CHECK( "VBV bitrate", (l->bitrate * cbp_factor) / 4, h->param.rc.i_vbv_max_bitrate ); CHECK( "VBV buffer", (l->cpb * cbp_factor) / 4, h->param.rc.i_vbv_buffer_size ); CHECK( "MV range", l->mv_range, h->param.analyse.i_mv_range ); CHECK( "interlaced", !l->frame_only, h->param.b_interlaced ); CHECK( "fake interlaced", !l->frame_only, h->param.b_fake_interlaced ); if( h->param.i_fps_den > 0 ) CHECK( "MB rate", l->mbps, (int64_t)mbs * h->param.i_fps_num / h->param.i_fps_den ); /* TODO check the rest of the limits */ return ret; } x264-master/encoder/set.h000066400000000000000000000076221502133446700154510ustar00rootroot00000000000000/***************************************************************************** * set.h: header writing ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_ENCODER_SET_H #define X264_ENCODER_SET_H #define x264_sps_init x264_template(sps_init) void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ); #define x264_sps_init_reconfigurable x264_template(sps_init_reconfigurable) void x264_sps_init_reconfigurable( x264_sps_t *sps, x264_param_t *param ); #define x264_sps_init_scaling_list x264_template(sps_init_scaling_list) void x264_sps_init_scaling_list( x264_sps_t *sps, x264_param_t *param ); #define x264_sps_write x264_template(sps_write) void x264_sps_write( bs_t *s, x264_sps_t *sps ); #define x264_pps_init x264_template(pps_init) void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t *sps ); #define x264_pps_write x264_template(pps_write) void x264_pps_write( bs_t *s, x264_sps_t *sps, x264_pps_t *pps ); #define x264_sei_recovery_point_write x264_template(sei_recovery_point_write) void x264_sei_recovery_point_write( x264_t *h, bs_t *s, int recovery_frame_cnt ); #define x264_sei_version_write x264_template(sei_version_write) int x264_sei_version_write( x264_t *h, bs_t *s ); #define x264_validate_levels x264_template(validate_levels) int x264_validate_levels( x264_t *h, int verbose ); #define x264_sei_buffering_period_write x264_template(sei_buffering_period_write) void x264_sei_buffering_period_write( x264_t *h, bs_t *s ); #define x264_sei_pic_timing_write x264_template(sei_pic_timing_write) void x264_sei_pic_timing_write( x264_t *h, bs_t *s ); #define x264_sei_dec_ref_pic_marking_write x264_template(sei_dec_ref_pic_marking_write) void x264_sei_dec_ref_pic_marking_write( x264_t *h, bs_t *s ); #define x264_sei_frame_packing_write x264_template(sei_frame_packing_write) void x264_sei_frame_packing_write( x264_t *h, bs_t *s ); #define x264_sei_mastering_display_write x264_template(sei_mastering_display_write) void x264_sei_mastering_display_write( x264_t *h, bs_t *s ); #define x264_sei_content_light_level_write x264_template(sei_content_light_level_write) void x264_sei_content_light_level_write( x264_t *h, bs_t *s ); #define x264_sei_alternative_transfer_write x264_template(sei_alternative_transfer_write) void x264_sei_alternative_transfer_write( x264_t *h, bs_t *s ); #define x264_sei_avcintra_umid_write x264_template(sei_avcintra_umid_write) int x264_sei_avcintra_umid_write( x264_t *h, bs_t *s ); #define x264_sei_avcintra_vanc_write x264_template(sei_avcintra_vanc_write) int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len ); #define x264_sei_write x264_template(sei_write) void x264_sei_write( bs_t *s, uint8_t *payload, int payload_size, int payload_type ); #define x264_filler_write x264_template(filler_write) void x264_filler_write( x264_t *h, bs_t *s, int filler ); #endif x264-master/encoder/slicetype-cl.c000066400000000000000000001116021502133446700172400ustar00rootroot00000000000000/***************************************************************************** * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) ***************************************************************************** * Copyright (C) 2012-2025 x264 project * * Authors: Steve Borho * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "common/common.h" #include "macroblock.h" #include "me.h" #if HAVE_OPENCL #ifdef _WIN32 #include <windows.h> #endif #define x264_weights_analyse x264_template(weights_analyse) void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); /* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined * in the OpenCL headers shipped with NVIDIA drivers. We need to be * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */ #define CL_QUEUE_THREAD_HANDLE_AMD 0x403E #define OCLCHECK( method, ... )\ do\ {\ if( h->opencl.b_fatal_error )\ return -1;\ status = ocl->method( __VA_ARGS__ );\ if( status != CL_SUCCESS ) {\ h->param.b_opencl = 0;\ h->opencl.b_fatal_error = 1;\ x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\ return -1;\ }\ } while( 0 ) void x264_opencl_flush( x264_t *h ) { x264_opencl_function_t *ocl = h->opencl.ocl; ocl->clFinish( h->opencl.queue ); /* Finish copies from the GPU by copying from the page-locked buffer to * their final destination */ for( int i = 0; i < h->opencl.num_copies; i++ ) memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes ); h->opencl.num_copies = 0; h->opencl.pl_occupancy = 0; } static void *opencl_alloc_locked( x264_t *h, int bytes ) { if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE ) x264_opencl_flush( h ); assert( bytes < PAGE_LOCKED_BUF_SIZE ); char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy; h->opencl.pl_occupancy += bytes; return ptr; } int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ) { if( fenc->b_intra_calculated ) return 0; fenc->b_intra_calculated = 1; x264_opencl_function_t *ocl = h->opencl.ocl; int luma_length = fenc->i_stride[0] * fenc->i_lines[0]; #define CREATEBUF( out, flags, size )\ out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; } #define CREATEIMAGE( out, flags, pf, width, height )\ out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; } int mb_count = h->mb.i_mb_count; cl_int status; if( !h->opencl.lowres_mv_costs ) { /* Allocate shared memory buffers */ int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL; int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL; cl_image_format pixel_format; pixel_format.image_channel_order = CL_R; pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) { pixel_format.image_channel_order = CL_RGBA;
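            /* A rough sketch of the pyramid allocated by this loop (assuming 8-bit depth, so
             * SIZEOF_PIXEL == 1, and NUM_IMAGE_SCALES == 4 as implied by the 4-entry
             * num_iterations[] table in x264_opencl_motionsearch below): for a 1920x1080 source
             * (120x68 macroblocks) the 8x8-per-macroblock images come out to roughly 960x544,
             * 480x272, 240x136 and 120x68, one level per iteration, since width and height are
             * halved after every CREATEIMAGE. */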
pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height ); width >>= 1; height >>= 1; } CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) ); CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 ); CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) ); CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) ); CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length ); CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length ); } if( !fenc->opencl.intra_cost ) { /* Allocate per-frame buffers */ int width = h->mb.i_mb_width * 8 * SIZEOF_PIXEL; int height = h->mb.i_mb_height * 8 * SIZEOF_PIXEL; cl_image_format pixel_format; pixel_format.image_channel_order = CL_R; pixel_format.image_channel_data_type = CL_UNSIGNED_INT32; CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height ); for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) { pixel_format.image_channel_order = CL_RGBA; pixel_format.image_channel_data_type = CL_UNSIGNED_INT8; CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height ); width >>= 1; height >>= 1; } CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) ); CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) ); CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) ); CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) ); } #undef CREATEBUF #undef CREATEIMAGE /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */ char *locked = opencl_alloc_locked( h, luma_length ); memcpy( locked, fenc->plane[0], luma_length ); OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL ); size_t gdim[2]; if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor ) { int size = h->mb.i_mb_count * sizeof(int16_t); locked = opencl_alloc_locked( h, size ); memcpy( locked, fenc->i_inv_qscale_factor, size ); OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL ); } else { /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */ cl_uint arg = 0; int16_t value = 256; OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value ); gdim[0] = h->mb.i_mb_count; OCLCHECK( 
clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL ); } int stride = fenc->i_stride[0]; cl_uint arg = 0; OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel ); OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride ); gdim[0] = 8 * h->mb.i_mb_width; gdim[1] = 8 * h->mb.i_mb_height; OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL ); for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ ) { /* Workaround for AMD Southern Island: * * Alternate kernel instances. No perf impact to this, so we do it for * all GPUs. It prevents the same kernel from being enqueued * back-to-back, avoiding a dependency calculation bug in the driver. */ cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2; arg = 0; OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] ); OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] ); gdim[0] >>= 1; gdim[1] >>= 1; if( gdim[0] < 16 || gdim[1] < 16 ) break; OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL ); } size_t ldim[2]; gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5; gdim[1] = 8*h->mb.i_mb_height; ldim[0] = 32; ldim[1] = 8; arg = 0; /* For presets slow, slower, and placebo, check all 10 intra modes that the * C lookahead supports. For faster presets, only check the most frequent 8 * modes */ int slow = h->param.analyse.i_subpel_refine > 7; OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda ); OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow ); OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); gdim[0] = 256; gdim[1] = h->mb.i_mb_height; ldim[0] = 256; ldim[1] = 1; arg = 0; OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) x264_opencl_flush( h ); int size = h->mb.i_mb_count * sizeof(int16_t); locked = opencl_alloc_locked( h, size ); OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, 
fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL ); h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0]; h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].bytes = size; h->opencl.num_copies++; size = h->mb.i_mb_height * sizeof(int); locked = opencl_alloc_locked( h, size ); OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0]; h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].bytes = size; h->opencl.num_copies++; size = sizeof(int) * 4; locked = opencl_alloc_locked( h, size ); OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0]; h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); h->opencl.num_copies++; h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0]; h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); h->opencl.num_copies++; h->opencl.last_buf = !h->opencl.last_buf; return 0; } /* This function was tested empirically on a number of AMD and NV GPUs. Making a * function which returns perfect launch dimensions is impossible; some * applications will have self-tuning code to try many possible variables and * measure the runtime. Here we simply make an educated guess based on what we * know GPUs typically prefer. */ static void optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device ) { x264_opencl_function_t *ocl = h->opencl.ocl; size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */ size_t preferred_multiple = 64; cl_uint num_cus = 6; ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL ); ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL ); ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL ); ldims[0] = preferred_multiple; ldims[1] = 8; /* make ldims[1] an even divisor of gdims[1] */ while( gdims[1] & (ldims[1] - 1) ) { ldims[0] <<= 1; ldims[1] >>= 1; } /* make total ldims fit under the max work-group dimensions for the device */ while( ldims[0] * ldims[1] > max_work_group ) { if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) ) ldims[1] >>= 1; else ldims[0] >>= 1; } if( ldims[0] > gdims[0] ) { /* remove preferred multiples until we're close to gdims[0] */ while( gdims[0] + preferred_multiple < ldims[0] ) ldims[0] -= preferred_multiple; gdims[0] = ldims[0]; } else { /* make gdims an even multiple of ldims */ gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0]; gdims[0] *= ldims[0]; } /* make ldims smaller to spread work across compute units */ while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus ) { if( ldims[0] > preferred_multiple ) ldims[0] >>= 1; else if( ldims[1] > 1 ) ldims[1] >>= 1; else break; } /* for smaller GPUs, try not to abuse their texture cache */ if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 ) ldims[0] = 32; } int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int 
lambda, const x264_weight_t *w ) { x264_opencl_function_t *ocl = h->opencl.ocl; x264_frame_t *fenc = frames[b]; x264_frame_t *fref = frames[ref]; cl_mem ref_scaled_images[NUM_IMAGE_SCALES]; cl_mem ref_luma_hpel; cl_int status; if( w && w->weightfn ) { size_t gdims[2]; gdims[0] = 8 * h->mb.i_mb_width; gdims[1] = 8 * h->mb.i_mb_height; /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */ for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) { cl_uint arg = 0; OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] ); OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] ); OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset ); OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale ); OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom ); OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL ); gdims[0] >>= 1; gdims[1] >>= 1; if( gdims[0] < 16 || gdims[1] < 16 ) break; } cl_uint arg = 0; gdims[0] = 8 * h->mb.i_mb_width; gdims[1] = 8 * h->mb.i_mb_height; OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel ); OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel ); OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset ); OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale ); OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom ); OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL ); /* Use weighted reference planes for motion search */ for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) ref_scaled_images[i] = h->opencl.weighted_scaled_images[i]; ref_luma_hpel = h->opencl.weighted_luma_hpel; } else { /* Use unweighted reference planes for motion search */ for( int i = 0; i < NUM_IMAGE_SCALES; i++ ) ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i]; ref_luma_hpel = fref->opencl.luma_hpel; } const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 }; int b_first_iteration = 1; int b_reverse_references = 1; int A = 1; int mb_per_group = 0; int cost_local_size = 0; int mvc_local_size = 0; int mb_width; size_t gdims[2]; size_t ldims[2]; /* scale 0 is 8x8 */ for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- ) { mb_width = h->mb.i_mb_width >> scale; gdims[0] = mb_width; gdims[1] = h->mb.i_mb_height >> scale; if( gdims[0] < 2 || gdims[1] < 2 ) continue; gdims[0] <<= 2; optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device ); mb_per_group = (ldims[0] >> 2) * ldims[1]; cost_local_size = 4 * mb_per_group * sizeof(int16_t); mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2; int scaled_me_range = h->param.analyse.i_me_range >> scale; int b_shift_index = 1; cl_uint arg = 0; OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); OCLCHECK( clSetKernelArg, 
h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references ); for( int iter = 0; iter < num_iterations[scale]; iter++ ) { OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); b_shift_index = 0; b_first_iteration = 0; /* alternate top-left vs bot-right MB references at lower scales, so * motion field smooths more quickly. */ if( scale > 2 ) b_reverse_references ^= 1; else b_reverse_references = 0; A = !A; OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration ); OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references ); } } int satd_local_size = mb_per_group * sizeof(uint32_t) * 16; cl_uint arg = 0; OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL ); if( b_islist1 ) { OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 ); } else { OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 ); } OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b ); OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref ); OCLCHECK( clSetKernelArg, 
h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 ); if( h->opencl.b_device_AMD_SI ) { /* workaround for AMD Southern Island driver scheduling bug (fixed in * July 2012), perform meaningless small copy to add a data dependency */ OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL ); } OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count; if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 ) x264_opencl_flush( h ); char *locked = opencl_alloc_locked( h, mvlen ); h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].bytes = mvlen; if( b_islist1 ) { int mvs_offset = mvlen * (ref - b - 1); OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL ); h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1]; } else { int mvs_offset = mvlen * (b - ref - 1); OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL ); h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1]; } h->opencl.num_copies++; return 0; } int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ) { x264_opencl_function_t *ocl = h->opencl.ocl; cl_int status; x264_frame_t *fenc = frames[b]; x264_frame_t *fref0 = frames[p0]; x264_frame_t *fref1 = frames[p1]; int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32; /* Tasks for this kernel: * 1. Select least cost mode (intra, ref0, ref1) * list_used 0, 1, 2, or 3. if B frame, do not allow intra * 2. if B frame, try bidir predictions. * 3. 
lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */ size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height }; size_t ldim_bidir[2]; size_t *ldims = NULL; int cost_local_size = 4; int satd_local_size = 4; if( b < p1 ) { /* For B frames, use 4 threads per MB for BIDIR checks */ ldims = ldim_bidir; gdims[0] <<= 2; optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device ); int mb_per_group = (ldims[0] >> 2) * ldims[1]; cost_local_size = 4 * mb_per_group * sizeof(int16_t); satd_local_size = 16 * mb_per_group * sizeof(uint32_t); } cl_uint arg = 0; OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 ); OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda ); OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL ); /* Sum costs across rows, atomicAdd down frame */ size_t gdim[2] = { 256, h->mb.i_mb_height }; size_t ldim[2] = { 256, 1 }; arg = 0; OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width ); OCLCHECK( clSetKernelArg, 
h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 ); OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 ); OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL ); if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 ) x264_opencl_flush( h ); int size = h->mb.i_mb_count * sizeof(int16_t); char *locked = opencl_alloc_locked( h, size ); h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b]; h->opencl.copies[h->opencl.num_copies].bytes = size; OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); h->opencl.num_copies++; size = h->mb.i_mb_height * sizeof(int); locked = opencl_alloc_locked( h, size ); h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b]; h->opencl.copies[h->opencl.num_copies].bytes = size; OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); h->opencl.num_copies++; size = 4 * sizeof(int); locked = opencl_alloc_locked( h, size ); OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL ); h->opencl.last_buf = !h->opencl.last_buf; h->opencl.copies[h->opencl.num_copies].src = locked; h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b]; h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); h->opencl.num_copies++; h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int); h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b]; h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); h->opencl.num_copies++; if( b == p1 ) // P frames only { h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int); h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0]; h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int); h->opencl.num_copies++; } return 0; } void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ) { if( h->param.b_opencl ) { #ifdef _WIN32 /* Temporarily boost priority of this lookahead thread and the OpenCL * driver's thread until the end of this function. On AMD GPUs this * greatly reduces the latency of enqueuing kernels and getting results * on Windows. 
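         * (CL_QUEUE_THREAD_HANDLE_AMD, defined near the top of this file, is an AMD-specific
         * query that returns the driver's command-queue thread as a Win32 HANDLE, so its
         * priority can be raised together with the lookahead thread's; both priorities are
         * restored in x264_opencl_slicetype_end() below.)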
*/ HANDLE id = GetCurrentThread(); h->opencl.lookahead_thread_pri = GetThreadPriority( id ); SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL ); x264_opencl_function_t *ocl = h->opencl.ocl; cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL ); if( status == CL_SUCCESS ) { h->opencl.opencl_thread_pri = GetThreadPriority( id ); SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL ); } #endif /* precalculate intra and I frames */ for( int i = 0; i <= num_frames; i++ ) x264_opencl_lowres_init( h, frames[i], lambda ); x264_opencl_flush( h ); if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe ) { /* For trellis B-Adapt, precompute exhaustive motion searches */ for( int b = 0; b <= num_frames; b++ ) { for( int j = 1; j < h->param.i_bframe; j++ ) { int p0 = b - j; if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF ) { const x264_weight_t *w = x264_weight_none; if( h->param.analyse.i_weighted_pred ) { x264_emms(); x264_weights_analyse( h, frames[b], frames[p0], 1 ); w = frames[b]->weight[0]; } frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w ); } int p1 = b + j; if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF ) { frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL ); } } } x264_opencl_flush( h ); } } } void x264_opencl_slicetype_end( x264_t *h ) { #ifdef _WIN32 if( h->param.b_opencl ) { HANDLE id = GetCurrentThread(); SetThreadPriority( id, h->opencl.lookahead_thread_pri ); x264_opencl_function_t *ocl = h->opencl.ocl; cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL ); if( status == CL_SUCCESS ) SetThreadPriority( id, h->opencl.opencl_thread_pri ); } #endif } int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ) { if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) ) return 0; else { int do_search[2]; int dist_scale_factor = 128; const x264_weight_t *w = x264_weight_none; // avoid duplicating work frames[b]->i_cost_est[b-p0][p1-b] = 0; do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF; if( do_search[0] ) { if( h->param.analyse.i_weighted_pred && b == p1 ) { x264_emms(); x264_weights_analyse( h, frames[b], frames[p0], 1 ); w = frames[b]->weight[0]; } frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; } if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0; if( b == p1 ) frames[b]->i_intra_mbs[b-p0] = 0; if( p1 != p0 ) dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); frames[b]->i_cost_est[b-p0][p1-b] = 0; frames[b]->i_cost_est_aq[b-p0][p1-b] = 0; x264_opencl_lowres_init( h, frames[b], lambda ); if( do_search[0] ) { x264_opencl_lowres_init( h, frames[p0], lambda ); x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w ); } if( do_search[1] ) { x264_opencl_lowres_init( h, frames[p1], lambda ); x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL ); } x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor ); return 1; } } #endif x264-master/encoder/slicetype-cl.h000066400000000000000000000046201502133446700172460ustar00rootroot00000000000000/***************************************************************************** * slicetype-cl.h: OpenCL slicetype decision code (lowres lookahead) 
***************************************************************************** * Copyright (C) 2017-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_ENCODER_SLICETYPE_CL_H #define X264_ENCODER_SLICETYPE_CL_H #define x264_opencl_lowres_init x264_template(opencl_lowres_init) int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda ); #define x264_opencl_motionsearch x264_template(opencl_motionsearch) int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w ); #define x264_opencl_finalize_cost x264_template(opencl_finalize_cost) int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor ); #define x264_opencl_precalculate_frame_cost x264_template(opencl_precalculate_frame_cost) int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b ); #define x264_opencl_flush x264_template(opencl_flush) void x264_opencl_flush( x264_t *h ); #define x264_opencl_slicetype_prep x264_template(opencl_slicetype_prep) void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda ); #define x264_opencl_slicetype_end x264_template(opencl_slicetype_end) void x264_opencl_slicetype_end( x264_t *h ); #endif x264-master/encoder/slicetype.c000066400000000000000000002442511502133446700166530ustar00rootroot00000000000000/***************************************************************************** * slicetype.c: lookahead analysis ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Fiona Glaser * Loren Merritt * Dylan Yudaken * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "common/common.h" #include "macroblock.h" #include "me.h" // Indexed by pic_struct values static const uint8_t delta_tfi_divisor[10] = { 0, 2, 1, 1, 2, 2, 3, 3, 4, 6 }; static int slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b ); #define x264_weights_analyse x264_template(weights_analyse) void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); #if HAVE_OPENCL #include "slicetype-cl.h" #endif static void lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) { a->i_qp = X264_LOOKAHEAD_QP; a->i_lambda = x264_lambda_tab[ a->i_qp ]; mb_analyse_load_costs( h, a ); if( h->param.analyse.i_subpel_refine > 1 ) { h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); h->mb.i_subpel_refine = 4; } else { h->mb.i_me_method = X264_ME_DIA; h->mb.i_subpel_refine = 2; } h->mb.b_chroma_me = 0; } /* makes a non-h264 weight (i.e. fix7), into an h264 weight */ static void weight_get_h264( int weight_nonh264, int offset, x264_weight_t *w ) { w->i_offset = offset; w->i_denom = 7; w->i_scale = weight_nonh264; while( w->i_denom > 0 && (w->i_scale > 127) ) { w->i_denom--; w->i_scale >>= 1; } w->i_scale = X264_MIN( w->i_scale, 127 ); } static NOINLINE pixel *weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dest ) { int ref0_distance = fenc->i_frame - ref->i_frame - 1; /* Note: this will never run during lookahead as weights_analyse is only called if no * motion search has been done. */ if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) { int i_stride = fenc->i_stride_lowres; int i_lines = fenc->i_lines_lowres; int i_width = fenc->i_width_lowres; int i_mb_xy = 0; pixel *p = dest; for( int y = 0; y < i_lines; y += 8, p += i_stride*8 ) for( int x = 0; x < i_width; x += 8, i_mb_xy++ ) { int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0]; int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1]; h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride, mvx+(x<<2), mvy+(y<<2), 8, 8, x264_weight_none ); } x264_emms(); return dest; } x264_emms(); return ref->lowres[0]; } /* How data is organized for 4:2:0/4:2:2 chroma weightp: * [U: ref] [U: fenc] * [V: ref] [V: fenc] * fenc = ref + offset * v = u + stride * chroma height */ static NOINLINE void weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv ) { int ref0_distance = fenc->i_frame - ref->i_frame - 1; int i_stride = fenc->i_stride[1]; int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; int v_shift = CHROMA_V_SHIFT; int cw = 8*h->mb.i_mb_width; int ch = 16*h->mb.i_mb_height >> v_shift; int height = 16 >> v_shift; if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) { x264_frame_expand_border_chroma( h, ref, 1 ); for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride ) for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 ) { pixel *pixu = dstu + pel_offset_y + pel_offset_x; pixel *pixv = dstv + pel_offset_y + pel_offset_x; pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */ int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0]; int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1]; h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height ); } } else h->mc.plane_copy_deinterleave( dstu, i_stride, dstv, i_stride, ref->plane[1], i_stride, cw, ch ); 
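    /* Note on the layout built here: the reference chroma (motion-compensated above when
     * lowres MVs are available, plainly deinterleaved otherwise) fills the left half of each
     * dstu/dstv row, and the call just below places the encoded frame's chroma at an offset
     * of i_width, i.e. in the right half.  weight_cost_chroma() relies on this layout when it
     * sets src = ref + i_width to compare the two halves. */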
h->mc.plane_copy_deinterleave( dstu+i_width, i_stride, dstv+i_width, i_stride, fenc->plane[1], i_stride, cw, ch ); x264_emms(); } static NOINLINE pixel *weight_cost_init_chroma444( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dst, int p ) { int ref0_distance = fenc->i_frame - ref->i_frame - 1; int i_stride = fenc->i_stride[p]; int i_lines = fenc->i_lines[p]; int i_width = fenc->i_width[p]; if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) { x264_frame_expand_border_chroma( h, ref, p ); for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 16, pel_offset_y = y*i_stride ) for( int x = 0, pel_offset_x = 0; x < i_width; x += 16, mb_xy++, pel_offset_x += 16 ) { pixel *pix = dst + pel_offset_y + pel_offset_x; pixel *src = ref->plane[p] + pel_offset_y + pel_offset_x; int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0] / 2; int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1] / 2; /* We don't want to calculate hpels for fenc frames, so we round the motion * vectors to fullpel here. It's not too bad, I guess? */ h->mc.copy_16x16_unaligned( pix, i_stride, src+mvx+mvy*i_stride, i_stride, 16 ); } x264_emms(); return dst; } x264_emms(); return ref->plane[p]; } static int weight_slice_header_cost( x264_t *h, x264_weight_t *w, int b_chroma ) { /* Add cost of weights in the slice header. */ int lambda = x264_lambda_tab[X264_LOOKAHEAD_QP]; /* 4 times higher, because chroma is analyzed at full resolution. */ if( b_chroma ) lambda *= 4; int numslices; if( h->param.i_slice_count ) numslices = h->param.i_slice_count; else if( h->param.i_slice_max_mbs ) numslices = (h->mb.i_mb_width * h->mb.i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs; else numslices = 1; /* FIXME: find a way to account for --slice-max-size? * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used. * Cut denom cost in half if chroma, since it's shared between the two chroma planes. 
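     * As a rough worked example (illustrative numbers, not taken from any particular encode):
     * a single-slice frame with a luma weight of denom=7, scale=90, offset=-2 costs
     *   10 + 2*bs_size_ue(7) + 2*(bs_size_se(90) + bs_size_se(-2)) = 10 + 2*7 + 2*(15 + 5) = 64 bits,
     * which is then multiplied by lambda and numslices in the return statement below.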
*/ int denom_cost = bs_size_ue( w[0].i_denom ) * (2 - b_chroma); return lambda * numslices * ( 10 + denom_cost + 2 * (bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset )) ); } static NOINLINE unsigned int weight_cost_luma( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w ) { unsigned int cost = 0; int i_stride = fenc->i_stride_lowres; int i_lines = fenc->i_lines_lowres; int i_width = fenc->i_width_lowres; pixel *fenc_plane = fenc->lowres[0]; ALIGNED_ARRAY_16( pixel, buf,[8*8] ); int pixoff = 0; int i_mb = 0; if( w ) { for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8) { w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 ); int cmp = h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ); cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] ); } cost += weight_slice_header_cost( h, w, 0 ); } else for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 ) { int cmp = h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ); cost += X264_MIN( cmp, fenc->i_intra_cost[i_mb] ); } x264_emms(); return cost; } static NOINLINE unsigned int weight_cost_chroma( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w ) { unsigned int cost = 0; int i_stride = fenc->i_stride[1]; int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; pixel *src = ref + i_width; ALIGNED_ARRAY_16( pixel, buf, [8*16] ); int pixoff = 0; int height = 16 >> CHROMA_V_SHIFT; if( w ) { for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) { w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height ); /* The naive and seemingly sensible algorithm is to use mbcmp as in luma. * But testing shows that for chroma the DC coefficient is by far the most * important part of the coding cost. Thus a more useful chroma weight is * obtained by comparing each block's DC coefficient instead of the actual * pixels. 
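         * (asd8 is an "absolute sum of differences": it accumulates the per-pixel differences
         * of the two blocks and takes the absolute value of the total, so it effectively
         * measures the DC offset between the blocks rather than their per-pixel error.)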
*/ cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height ); } cost += weight_slice_header_cost( h, w, 1 ); } else for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height ); x264_emms(); return cost; } static NOINLINE unsigned int weight_cost_chroma444( x264_t *h, x264_frame_t *fenc, pixel *ref, x264_weight_t *w, int p ) { unsigned int cost = 0; int i_stride = fenc->i_stride[p]; int i_lines = fenc->i_lines[p]; int i_width = fenc->i_width[p]; pixel *src = fenc->plane[p]; ALIGNED_ARRAY_64( pixel, buf, [16*16] ); int pixoff = 0; if( w ) { for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 16, pixoff += 16 ) { w->weightfn[16>>2]( buf, 16, &ref[pixoff], i_stride, w, 16 ); cost += h->pixf.mbcmp[PIXEL_16x16]( buf, 16, &src[pixoff], i_stride ); } cost += weight_slice_header_cost( h, w, 1 ); } else for( int y = 0; y < i_lines; y += 16, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 16, pixoff += 16 ) cost += h->pixf.mbcmp[PIXEL_16x16]( &ref[pixoff], i_stride, &src[pixoff], i_stride ); x264_emms(); return cost; } void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ const float epsilon = 1.f/128.f; x264_weight_t *weights = fenc->weight[0]; SET_WEIGHT( weights[0], 0, 1, 0, 0 ); SET_WEIGHT( weights[1], 0, 1, 0, 0 ); SET_WEIGHT( weights[2], 0, 1, 0, 0 ); int chroma_initted = 0; float guess_scale[3]; float fenc_mean[3]; float ref_mean[3]; for( int plane = 0; plane <= 2*!b_lookahead; plane++ ) { if( !plane || CHROMA_FORMAT ) { int zero_bias = !ref->i_pixel_ssd[plane]; float fenc_var = fenc->i_pixel_ssd[plane] + zero_bias; float ref_var = ref->i_pixel_ssd[plane] + zero_bias; guess_scale[plane] = sqrtf( fenc_var / ref_var ); fenc_mean[plane] = (float)(fenc->i_pixel_sum[plane] + zero_bias) / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); ref_mean[plane] = (float)( ref->i_pixel_sum[plane] + zero_bias) / (fenc->i_lines[!!plane] * fenc->i_width[!!plane]) / (1 << (BIT_DEPTH - 8)); } else { guess_scale[plane] = 1; fenc_mean[plane] = 0; ref_mean[plane] = 0; } } int chroma_denom = 7; if( !b_lookahead ) { /* make sure both our scale factors fit */ while( chroma_denom > 0 ) { float thresh = 127.f / (1< 127 ) { weights[1].weightfn = weights[2].weightfn = NULL; break; } } else weight_get_h264( round( guess_scale[plane] * 128 ), 0, &weights[plane] ); found = 0; mindenom = weights[plane].i_denom; minscale = weights[plane].i_scale; minoff = 0; pixel *mcbuf; if( !plane ) { if( !fenc->b_intra_calculated ) { x264_mb_analysis_t a; lowres_context_init( h, &a ); slicetype_frame_cost( h, &a, &fenc, 0, 0, 0 ); } mcbuf = weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] ); origscore = minscore = weight_cost_luma( h, fenc, mcbuf, NULL ); } else { if( CHROMA444 ) { mcbuf = weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane ); origscore = minscore = weight_cost_chroma444( h, fenc, mcbuf, NULL, plane ); } else { pixel *dstu = h->mb.p_weight_buf[0]; pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1]; if( !chroma_initted++ ) weight_cost_init_chroma( h, fenc, ref, dstu, dstv ); mcbuf = plane == 1 ? 
dstu : dstv; origscore = minscore = weight_cost_chroma( h, fenc, mcbuf, NULL ); } } if( !minscore ) continue; /* Picked somewhat arbitrarily */ static const uint8_t weight_check_distance[][2] = { {0,0},{0,0},{0,1},{0,1}, {0,1},{0,1},{0,1},{1,1}, {1,1},{2,1},{2,1},{4,2} }; int scale_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][0]; int offset_dist = b_lookahead ? 0 : weight_check_distance[h->param.analyse.i_subpel_refine][1]; int start_scale = x264_clip3( minscale - scale_dist, 0, 127 ); int end_scale = x264_clip3( minscale + scale_dist, 0, 127 ); for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ ) { int cur_scale = i_scale; int cur_offset = fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead; if( cur_offset < - 128 || cur_offset > 127 ) { /* Rescale considering the constraints on cur_offset. We do it in this order * because scale has a much wider range than offset (because of denom), so * it should almost never need to be clamped. */ cur_offset = x264_clip3( cur_offset, -128, 127 ); cur_scale = x264_clip3f( (1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f, 0, 127 ); } int start_offset = x264_clip3( cur_offset - offset_dist, -128, 127 ); int end_offset = x264_clip3( cur_offset + offset_dist, -128, 127 ); for( int i_off = start_offset; i_off <= end_offset; i_off++ ) { SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off ); unsigned int s; if( plane ) { if( CHROMA444 ) s = weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane ); else s = weight_cost_chroma( h, fenc, mcbuf, &weights[plane] ); } else s = weight_cost_luma( h, fenc, mcbuf, &weights[plane] ); COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 ); // Don't check any more offsets if the previous one had a lower cost than the current one if( minoff == start_offset && i_off != start_offset ) break; } } x264_emms(); /* Use a smaller denominator if possible */ if( !plane ) { while( mindenom > 0 && !(minscale&1) ) { mindenom--; minscale >>= 1; } } /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f ) { SET_WEIGHT( weights[plane], 0, 1, 0, 0 ); continue; } else SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff ); if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane ) fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore; } /* Optimize and unify denominator */ if( weights[1].weightfn || weights[2].weightfn ) { int denom = weights[1].weightfn ? 
weights[1].i_denom : weights[2].i_denom; int both_weighted = weights[1].weightfn && weights[2].weightfn; /* If only one plane is weighted, the other has an implicit scale of 1< 0 && !(weights[1].weightfn && (weights[1].i_scale&1)) && !(weights[2].weightfn && (weights[2].i_scale&1))) ) { denom--; for( int i = 1; i <= 2; i++ ) if( weights[i].weightfn ) { weights[i].i_scale >>= 1; weights[i].i_denom = denom; } } } for( int i = 1; i <= 2; i++ ) if( weights[i].weightfn ) h->mc.weight_cache( h, &weights[i] ); if( weights[0].weightfn && b_lookahead ) { //scale lowres in lookahead for slicetype_frame_cost pixel *src = ref->buffer_lowres; pixel *dst = h->mb.p_weight_buf[0]; int width = ref->i_width_lowres + PADH2; int height = ref->i_lines_lowres + PADV*2; x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres, width, height, &weights[0] ); fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH_ALIGN + ref->i_stride_lowres * PADV; } } /* Output buffers are separated by 128 bytes to avoid false sharing of cachelines * in multithreaded lookahead. */ #define PAD_SIZE 32 /* cost_est, cost_est_aq, intra_mbs, num rows */ #define NUM_INTS 4 #define COST_EST 0 #define COST_EST_AQ 1 #define INTRA_MBS 2 #define NUM_ROWS 3 #define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start)) static void slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor, int do_search[2], const x264_weight_t *w, int *output_inter, int *output_intra ) { x264_frame_t *fref0 = frames[p0]; x264_frame_t *fref1 = frames[p1]; x264_frame_t *fenc = frames[b]; const int b_bidir = (b < p1); const int i_mb_x = h->mb.i_mb_x; const int i_mb_y = h->mb.i_mb_y; const int i_mb_stride = h->mb.i_mb_width; const int i_mb_xy = i_mb_x + i_mb_y * i_mb_stride; const int i_stride = fenc->i_stride_lowres; const int i_pel_offset = 8 * (i_mb_x + i_mb_y * i_stride); const int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32; int16_t (*fenc_mvs[2])[2] = { b != p0 ? &fenc->lowres_mvs[0][b-p0-1][i_mb_xy] : NULL, b != p1 ? &fenc->lowres_mvs[1][p1-b-1][i_mb_xy] : NULL }; int (*fenc_costs[2]) = { b != p0 ? &fenc->lowres_mv_costs[0][b-p0-1][i_mb_xy] : NULL, b != p1 ? &fenc->lowres_mv_costs[1][p1-b-1][i_mb_xy] : NULL }; int b_frame_score_mb = (i_mb_x > 0 && i_mb_x < h->mb.i_mb_width - 1 && i_mb_y > 0 && i_mb_y < h->mb.i_mb_height - 1) || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2; ALIGNED_ARRAY_16( pixel, pix1,[9*FDEC_STRIDE] ); pixel *pix2 = pix1+8; x264_me_t m[2]; int i_bcost = COST_MAX; int list_used = 0; /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. 
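     * (The bias is added after the >> (BIT_DEPTH - 8) renormalization further down, so the
     * value of 4 is in 8-bit cost units regardless of the configured bit depth.)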
*/ int lowres_penalty = 4; h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, &fenc->lowres[0][i_pel_offset], i_stride, 8 ); if( p0 == p1 ) goto lowres_intra_mb; int mv_range = 2 * h->param.analyse.i_mv_range; // no need for h->mb.mv_min[] h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range ); h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 ); h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2; h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2; if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range ); h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 ); h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2; h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2; } #define LOAD_HPELS_LUMA(dst, src) \ { \ (dst)[0] = &(src)[0][i_pel_offset]; \ (dst)[1] = &(src)[1][i_pel_offset]; \ (dst)[2] = &(src)[2][i_pel_offset]; \ (dst)[3] = &(src)[3][i_pel_offset]; \ } #define LOAD_WPELS_LUMA(dst,src) \ (dst) = &(src)[i_pel_offset]; #define CLIP_MV( mv ) \ { \ mv[0] = x264_clip3( mv[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); \ mv[1] = x264_clip3( mv[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); \ } #define TRY_BIDIR( mv0, mv1, penalty ) \ { \ int i_cost; \ if( h->param.analyse.i_subpel_refine <= 1 ) \ { \ int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \ int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \ pixel *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \ pixel *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \ } \ else \ { \ intptr_t stride1 = 16, stride2 = 16; \ pixel *src1, *src2; \ src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \ (mv0)[0], (mv0)[1], 8, 8, w ); \ src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \ (mv1)[0], (mv1)[1], 8, 8, w ); \ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \ } \ i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \ m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \ COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \ } m[0].i_pixel = PIXEL_8x8; m[0].p_cost_mv = a->p_cost_mv; m[0].i_stride[0] = i_stride; m[0].p_fenc[0] = h->mb.pic.p_fenc[0]; m[0].weight = w; m[0].i_ref = 0; LOAD_HPELS_LUMA( m[0].p_fref, fref0->lowres ); m[0].p_fref_w = m[0].p_fref[0]; if( w[0].weightfn ) LOAD_WPELS_LUMA( m[0].p_fref_w, fenc->weighted[0] ); if( b_bidir ) { ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] ); m[1].i_pixel = PIXEL_8x8; m[1].p_cost_mv = a->p_cost_mv; m[1].i_stride[0] = i_stride; m[1].p_fenc[0] = h->mb.pic.p_fenc[0]; m[1].i_ref = 0; m[1].weight = x264_weight_none; LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres ); m[1].p_fref_w = m[1].p_fref[0]; if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF ) { int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; dmv[1][0] = dmv[0][0] - mvr[0]; dmv[1][1] = dmv[0][1] - mvr[1]; CLIP_MV( dmv[0] ); CLIP_MV( dmv[1] ); if( h->param.analyse.i_subpel_refine <= 1 ) M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ } else M64( dmv ) = 0; TRY_BIDIR( dmv[0], dmv[1], 0 ); if( M64( dmv ) ) { int i_cost; h->mc.avg[PIXEL_8x8]( pix1, 16, m[0].p_fref[0], m[0].i_stride[0], 
m[1].p_fref[0], m[1].i_stride[0], i_bipred_weight ); i_cost = h->pixf.mbcmp[PIXEL_8x8]( m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); } } for( int l = 0; l < 1 + b_bidir; l++ ) { if( do_search[l] ) { int i_mvc = 0; int16_t (*fenc_mv)[2] = fenc_mvs[l]; ALIGNED_ARRAY_8( int16_t, mvc,[4],[2] ); /* Reverse-order MV prediction. */ M32( mvc[0] ) = 0; M32( mvc[2] ) = 0; #define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; } if( i_mb_x < h->mb.i_mb_width - 1 ) MVC( fenc_mv[1] ); if( i_mb_y < h->i_threadslice_end - 1 ) { MVC( fenc_mv[i_mb_stride] ); if( i_mb_x > 0 ) MVC( fenc_mv[i_mb_stride-1] ); if( i_mb_x < h->mb.i_mb_width - 1 ) MVC( fenc_mv[i_mb_stride+1] ); } #undef MVC if( i_mvc <= 1 ) CP32( m[l].mvp, mvc[0] ); else x264_median_mv( m[l].mvp, mvc[0], mvc[1], mvc[2] ); /* Fast skip for cases of near-zero residual. Shortcut: don't bother except in the mv0 case, * since anything else is likely to have enough residual to not trigger the skip. */ if( !M32( m[l].mvp ) ) { m[l].cost = h->pixf.mbcmp[PIXEL_8x8]( m[l].p_fenc[0], FENC_STRIDE, m[l].p_fref[0], m[l].i_stride[0] ); if( m[l].cost < 64 ) { M32( m[l].mv ) = 0; goto skip_motionest; } } x264_me_search( h, &m[l], mvc, i_mvc ); m[l].cost -= a->p_cost_mv[0]; // remove mvcost from skip mbs if( M32( m[l].mv ) ) m[l].cost += 5 * a->i_lambda; skip_motionest: CP32( fenc_mvs[l], m[l].mv ); *fenc_costs[l] = m[l].cost; } else { CP32( m[l].mv, fenc_mvs[l] ); m[l].cost = *fenc_costs[l]; } COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 ); } if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) ) TRY_BIDIR( m[0].mv, m[1].mv, 5 ); lowres_intra_mb: if( !fenc->b_intra_calculated ) { ALIGNED_ARRAY_16( pixel, edge,[36] ); pixel *pix = &pix1[8+FDEC_STRIDE]; pixel *src = &fenc->lowres[0][i_pel_offset]; const int intra_penalty = 5 * a->i_lambda; int satds[3]; int pixoff = 4 / SIZEOF_PIXEL; /* Avoid store forwarding stalls by writing larger chunks */ memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * SIZEOF_PIXEL ); for( int i = -1; i < 8; i++ ) M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] ); h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds ); int i_icost = X264_MIN3( satds[0], satds[1], satds[2] ); if( h->param.analyse.i_subpel_refine > 1 ) { h->predict_8x8c[I_PRED_CHROMA_P]( pix ); int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ); i_icost = X264_MIN( i_icost, satd ); h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); for( int i = 3; i < 9; i++ ) { h->predict_8x8[i]( pix, edge ); satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ); i_icost = X264_MIN( i_icost, satd ); } } i_icost = ((i_icost + intra_penalty) >> (BIT_DEPTH - 8)) + lowres_penalty; fenc->i_intra_cost[i_mb_xy] = i_icost; int i_icost_aq = i_icost; if( h->param.rc.i_aq_mode ) i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8; output_intra[ROW_SATD] += i_icost_aq; if( b_frame_score_mb ) { output_intra[COST_EST] += i_icost; output_intra[COST_EST_AQ] += i_icost_aq; } } i_bcost = (i_bcost >> (BIT_DEPTH - 8)) + lowres_penalty; /* forbid intra-mbs in B-frames, because it's rare and not worth checking */ /* FIXME: Should we still forbid them now that we cache intra scores? 
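     * (When the cached intra cost wins here, list_used is cleared to 0, so the block is tagged
     * as intra in lowres_costs and counted into output_inter[INTRA_MBS], which is summed into
     * fenc->i_intra_mbs[] once the per-thread accumulators are merged for a P-style estimate.)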
*/ if( !b_bidir ) { int i_icost = fenc->i_intra_cost[i_mb_xy]; int b_intra = i_icost < i_bcost; if( b_intra ) { i_bcost = i_icost; list_used = 0; } if( b_frame_score_mb ) output_inter[INTRA_MBS] += b_intra; } /* In an I-frame, we've already added the results above in the intra section. */ if( p0 != p1 ) { int i_bcost_aq = i_bcost; if( h->param.rc.i_aq_mode ) i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8; output_inter[ROW_SATD] += i_bcost_aq; if( b_frame_score_mb ) { /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */ output_inter[COST_EST] += i_bcost; output_inter[COST_EST_AQ] += i_bcost_aq; } } fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); } #undef TRY_BIDIR #define NUM_MBS\ (h->mb.i_mb_width > 2 && h->mb.i_mb_height > 2 ?\ (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\ h->mb.i_mb_width * h->mb.i_mb_height) typedef struct { x264_t *h; x264_mb_analysis_t *a; x264_frame_t **frames; int p0; int p1; int b; int dist_scale_factor; int *do_search; const x264_weight_t *w; int *output_inter; int *output_intra; } x264_slicetype_slice_t; static void slicetype_slice_cost( x264_slicetype_slice_t *s ) { x264_t *h = s->h; /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode. * This considerably improves MV prediction overall. */ /* The edge mbs seem to reduce the predictive quality of the * whole frame's score, but are needed for a spatial distribution. */ int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2; int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges ); int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges ); int start_x = h->mb.i_mb_width - 2 + do_edges; int end_x = 1 - do_edges; for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- ) for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- ) slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor, s->do_search, s->w, s->output_inter, s->output_intra ); } static int slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b ) { int i_score = 0; int do_search[2]; const x264_weight_t *w = x264_weight_none; x264_frame_t *fenc = frames[b]; /* Check whether we already evaluated this frame * If we have tried this frame as P, then we have also tried * the preceding frames as B. (is this still true?) */ /* Also check that we already calculated the row SATDs for the current frame. */ if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) ) i_score = fenc->i_cost_est[b-p0][p1-b]; else { int dist_scale_factor = 128; /* For each list, check to see whether we have lowres motion-searched this reference frame before. 
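 * (Editor's illustration, not part of the original source: the first MV slot of each list,
 * lowres_mvs[l][d][0][0], doubles as a "never searched" sentinel with the value 0x7FFF,
 * which is why it is compared against 0x7FFF here and cleared to 0 before searching.
 * The dist_scale_factor computed a few lines below is the fix8 temporal position of b
 * between p0 and p1; e.g. with p0=0, b=1, p1=3:
 *     dist_scale_factor = ((1<<8) + (3>>1)) / 3 = 257/3 = 85    (~ 256/3)
 * so in the bidir setup of slicetype_mb_cost() above, a co-located MV component mvr=24 gives
 *     dmv[0] = (24*85 + 128) >> 8 = 8     (list-0 candidate, ~  mvr * 1/3)
 *     dmv[1] = 8 - 24 = -16               (list-1 candidate, ~ -mvr * 2/3).)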
*/ do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF; do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF; if( do_search[0] ) { if( h->param.analyse.i_weighted_pred && b == p1 ) { x264_emms(); x264_weights_analyse( h, fenc, frames[p0], 1 ); w = fenc->weight[0]; } fenc->lowres_mvs[0][b-p0-1][0][0] = 0; } if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0; if( p1 != p0 ) dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads; int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1]; int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1]; output_inter[0] = h->scratch_buffer2; output_intra[0] = output_inter[0] + output_buf_size; #if HAVE_OPENCL if( h->param.b_opencl ) { x264_opencl_lowres_init(h, fenc, a->i_lambda ); if( do_search[0] ) { x264_opencl_lowres_init( h, frames[p0], a->i_lambda ); x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w ); } if( do_search[1] ) { x264_opencl_lowres_init( h, frames[p1], a->i_lambda ); x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL ); } if( b != p0 ) x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor ); x264_opencl_flush( h ); i_score = fenc->i_cost_est[b-p0][p1-b]; } else #endif { if( h->param.i_lookahead_threads > 1 ) { x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX]; for( int i = 0; i < h->param.i_lookahead_threads; i++ ) { x264_t *t = h->lookahead_thread[i]; /* FIXME move this somewhere else */ t->mb.i_me_method = h->mb.i_me_method; t->mb.i_subpel_refine = h->mb.i_subpel_refine; t->mb.b_chroma_me = h->mb.b_chroma_me; s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w, output_inter[i], output_intra[i] }; t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads); int thread_height = t->i_threadslice_end - t->i_threadslice_start; int thread_output_size = thread_height + NUM_INTS; memset( output_inter[i], 0, thread_output_size * sizeof(int) ); memset( output_intra[i], 0, thread_output_size * sizeof(int) ); output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height; output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE; output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE; x264_threadpool_run( h->lookaheadpool, (void*)slicetype_slice_cost, &s[i] ); } for( int i = 0; i < h->param.i_lookahead_threads; i++ ) x264_threadpool_wait( h->lookaheadpool, &s[i] ); } else { h->i_threadslice_start = 0; h->i_threadslice_end = h->mb.i_mb_height; memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) ); output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height; x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w, output_inter[0], output_intra[0] }; slicetype_slice_cost( &s ); } /* Sum up accumulators */ if( b == p1 ) fenc->i_intra_mbs[b-p0] = 0; if( !fenc->b_intra_calculated ) { fenc->i_cost_est[0][0] = 0; fenc->i_cost_est_aq[0][0] = 0; } fenc->i_cost_est[b-p0][p1-b] = 0; fenc->i_cost_est_aq[b-p0][p1-b] = 0; int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b]; int *row_satd_intra = fenc->i_row_satds[0][0]; for( int i = 0; i < h->param.i_lookahead_threads; i++ ) { if( b == p1 
) fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS]; if( !fenc->b_intra_calculated ) { fenc->i_cost_est[0][0] += output_intra[i][COST_EST]; fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ]; } fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST]; fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ]; if( h->param.rc.i_vbv_buffer_size ) { int row_count = output_inter[i][NUM_ROWS]; memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) ); if( !fenc->b_intra_calculated ) memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) ); row_satd_inter += row_count; row_satd_intra += row_count; } } i_score = fenc->i_cost_est[b-p0][p1-b]; if( b != p1 ) i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias); else fenc->b_intra_calculated = 1; fenc->i_cost_est[b-p0][p1-b] = i_score; x264_emms(); } } return i_score; } /* If MB-tree changes the quantizers, we need to recalculate the frame cost without * re-running lookahead. */ static int slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b ) { int i_score = 0; int *row_satd = frames[b]->i_row_satds[b-p0][p1-b]; float *qp_offset = IS_X264_TYPE_B(frames[b]->i_type) ? frames[b]->f_qp_offset_aq : frames[b]->f_qp_offset; x264_emms(); for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- ) { row_satd[ h->mb.i_mb_y ] = 0; for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- ) { int i_mb_xy = h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride; int i_mb_cost = frames[b]->lowres_costs[b-p0][p1-b][i_mb_xy] & LOWRES_COST_MASK; float qp_adj = qp_offset[i_mb_xy]; i_mb_cost = (i_mb_cost * x264_exp2fix8(qp_adj) + 128) >> 8; row_satd[ h->mb.i_mb_y ] += i_mb_cost; if( (h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->mb.i_mb_height - 1 && h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->mb.i_mb_width - 1) || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 ) { i_score += i_mb_cost; } } } return i_score; } /* Trade off precision in mbtree for increased range */ #define MBTREE_PRECISION 0.5f static void macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance ) { int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION ); float weightdelta = 0.0; if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 ) weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]); /* Allow the strength to be adjusted via qcompress, since the two * concepts are very similar. */ float strength = 5.0f * (1.0f - h->param.rc.f_qcompress); for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ ) { int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8; if( intra_cost ) { int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor + 128) >> 8; float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta; frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio; } } } static void macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced ) { uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost}; int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32; int16_t (*mvs[2])[2] = { b != p0 ? 
frames[b]->lowres_mvs[0][b-p0-1] : NULL, b != p1 ? frames[b]->lowres_mvs[1][p1-b-1] : NULL }; int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight}; int16_t *buf = h->scratch_buffer; uint16_t *propagate_cost = frames[b]->i_propagate_cost; uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b]; x264_emms(); float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION; /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */ if( !referenced ) memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) ); for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->mb.i_mb_height; h->mb.i_mb_y++ ) { int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride; h->mc.mbtree_propagate_cost( buf, propagate_cost, frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index, frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width ); if( referenced ) propagate_cost += h->mb.i_mb_width; h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index], bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 ); if( b != p1 ) { h->mc.mbtree_propagate_list( h, ref_costs[1], &mvs[1][mb_index], buf, &lowres_costs[mb_index], bipred_weights[1], h->mb.i_mb_y, h->mb.i_mb_width, 1 ); } } if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced ) macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 ); } static void macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra ) { int idx = !b_intra; int last_nonb, cur_nonb = 1; int bframes = 0; x264_emms(); float total_duration = 0.0; for( int j = 0; j <= num_frames; j++ ) total_duration += frames[j]->f_duration; float average_duration = total_duration / (num_frames + 1); int i = num_frames; if( b_intra ) slicetype_frame_cost( h, a, frames, 0, 0, 0 ); while( i > 0 && IS_X264_TYPE_B( frames[i]->i_type ) ) i--; last_nonb = i; /* Lookaheadless MB-tree is not a theoretically distinct case; the same extrapolation could * be applied to the end of a lookahead buffer of any size. However, it's most needed when * lookahead=0, so that's what's currently implemented. */ if( !h->param.rc.i_lookahead ) { if( b_intra ) { memset( frames[0]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) ); memcpy( frames[0]->f_qp_offset, frames[0]->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) ); return; } XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost ); memset( frames[0]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) ); } else { if( last_nonb < idx ) return; memset( frames[last_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) ); } while( i-- > idx ) { cur_nonb = i; while( IS_X264_TYPE_B( frames[cur_nonb]->i_type ) && cur_nonb > 0 ) cur_nonb--; if( cur_nonb < idx ) break; slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, last_nonb ); memset( frames[cur_nonb]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) ); bframes = last_nonb - cur_nonb - 1; if( h->param.i_bframe_pyramid && bframes > 1 ) { int middle = (bframes + 1)/2 + cur_nonb; slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, middle ); memset( frames[middle]->i_propagate_cost, 0, h->mb.i_mb_count * sizeof(uint16_t) ); while( i > cur_nonb ) { int p0 = i > middle ? middle : cur_nonb; int p1 = i < middle ? 
middle : last_nonb; if( i != middle ) { slicetype_frame_cost( h, a, frames, p0, p1, i ); macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 ); } i--; } macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 ); } else { while( i > cur_nonb ) { slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i ); macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 ); i--; } } macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 ); last_nonb = cur_nonb; } if( !h->param.rc.i_lookahead ) { slicetype_frame_cost( h, a, frames, 0, last_nonb, last_nonb ); macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 ); XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost ); } macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb ); if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size ) macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 ); } static int vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b ) { int cost = slicetype_frame_cost( h, a, frames, p0, p1, b ); if( h->param.rc.i_aq_mode ) { if( h->param.rc.b_mb_tree ) return slicetype_frame_cost_recalculate( h, frames, p0, p1, b ); else return frames[b]->i_cost_est_aq[b-p0][p1-b]; } return cost; } static void calculate_durations( x264_t *h, x264_frame_t *cur_frame, x264_frame_t *prev_frame, int64_t *i_cpb_delay, int64_t *i_coded_fields ) { cur_frame->i_cpb_delay = *i_cpb_delay; cur_frame->i_dpb_output_delay = cur_frame->i_field_cnt - *i_coded_fields; // add a correction term for frame reordering cur_frame->i_dpb_output_delay += h->sps->vui.i_num_reorder_frames*2; // fix possible negative dpb_output_delay because of pulldown changes and reordering if( cur_frame->i_dpb_output_delay < 0 ) { cur_frame->i_cpb_delay += cur_frame->i_dpb_output_delay; cur_frame->i_dpb_output_delay = 0; if( prev_frame ) prev_frame->i_cpb_duration += cur_frame->i_dpb_output_delay; } // don't reset cpb delay for IDR frames when using intra-refresh if( cur_frame->b_keyframe && !h->param.b_intra_refresh ) *i_cpb_delay = 0; *i_cpb_delay += cur_frame->i_duration; *i_coded_fields += cur_frame->i_duration; cur_frame->i_cpb_duration = cur_frame->i_duration; } static void vbv_lookahead( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int keyframe ) { int last_nonb = 0, cur_nonb = 1, idx = 0; x264_frame_t *prev_frame = NULL; int prev_frame_idx = 0; while( cur_nonb < num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; int next_nonb = keyframe ? last_nonb : cur_nonb; if( frames[cur_nonb]->i_coded_fields_lookahead >= 0 ) { h->i_coded_fields_lookahead = frames[cur_nonb]->i_coded_fields_lookahead; h->i_cpb_delay_lookahead = frames[cur_nonb]->i_cpb_delay_lookahead; } while( cur_nonb < num_frames ) { /* P/I cost: This shouldn't include the cost of next_nonb */ if( next_nonb != cur_nonb ) { int p0 = IS_X264_TYPE_I( frames[cur_nonb]->i_type ) ? 
cur_nonb : last_nonb; frames[next_nonb]->i_planned_satd[idx] = vbv_frame_cost( h, a, frames, p0, cur_nonb, cur_nonb ); frames[next_nonb]->i_planned_type[idx] = frames[cur_nonb]->i_type; frames[cur_nonb]->i_coded_fields_lookahead = h->i_coded_fields_lookahead; frames[cur_nonb]->i_cpb_delay_lookahead = h->i_cpb_delay_lookahead; calculate_durations( h, frames[cur_nonb], prev_frame, &h->i_cpb_delay_lookahead, &h->i_coded_fields_lookahead ); if( prev_frame ) { frames[next_nonb]->f_planned_cpb_duration[prev_frame_idx] = (double)prev_frame->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; } frames[next_nonb]->f_planned_cpb_duration[idx] = (double)frames[cur_nonb]->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; prev_frame = frames[cur_nonb]; prev_frame_idx = idx; idx++; } /* Handle the B-frames: coded order */ for( int i = last_nonb+1; i < cur_nonb; i++, idx++ ) { frames[next_nonb]->i_planned_satd[idx] = vbv_frame_cost( h, a, frames, last_nonb, cur_nonb, i ); frames[next_nonb]->i_planned_type[idx] = X264_TYPE_B; frames[i]->i_coded_fields_lookahead = h->i_coded_fields_lookahead; frames[i]->i_cpb_delay_lookahead = h->i_cpb_delay_lookahead; calculate_durations( h, frames[i], prev_frame, &h->i_cpb_delay_lookahead, &h->i_coded_fields_lookahead ); if( prev_frame ) { frames[next_nonb]->f_planned_cpb_duration[prev_frame_idx] = (double)prev_frame->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; } frames[next_nonb]->f_planned_cpb_duration[idx] = (double)frames[i]->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; prev_frame = frames[i]; prev_frame_idx = idx; } last_nonb = cur_nonb; cur_nonb++; while( cur_nonb <= num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; } frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO; } static uint64_t slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, char *path, uint64_t threshold ) { uint64_t cost = 0; int loc = 1; int cur_nonb = 0; path--; /* Since the 1st path element is really the second frame */ while( path[loc] ) { int next_nonb = loc; /* Find the location of the next non-B-frame. */ while( path[next_nonb] == 'B' ) next_nonb++; /* Add the cost of the non-B-frame found above */ if( path[next_nonb] == 'P' ) cost += slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_nonb ); else /* I-frame */ cost += slicetype_frame_cost( h, a, frames, next_nonb, next_nonb, next_nonb ); /* Early terminate if the cost we have found is larger than the best path cost so far */ if( cost > threshold ) break; if( h->param.i_bframe_pyramid && next_nonb - cur_nonb > 2 ) { int middle = cur_nonb + (next_nonb - cur_nonb)/2; cost += slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, middle ); for( int next_b = loc; next_b < middle && cost < threshold; next_b++ ) cost += slicetype_frame_cost( h, a, frames, cur_nonb, middle, next_b ); for( int next_b = middle+1; next_b < next_nonb && cost < threshold; next_b++ ) cost += slicetype_frame_cost( h, a, frames, middle, next_nonb, next_b ); } else for( int next_b = loc; next_b < next_nonb && cost < threshold; next_b++ ) cost += slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_b ); loc = next_nonb + 1; cur_nonb = next_nonb; } return cost; } /* Viterbi/trellis slicetype decision algorithm. */ /* Uses strings due to the fact that the speed of the control functions is negligible compared to the cost of running slicetype_frame_cost, and because it makes debugging easier. 
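 * (Editor's illustration, not part of the original source: for example, with length == 4
 * and --bframes 3, slicetype_path() below builds num_paths == 4 candidates by taking the
 * best known prefix of length (length - path - 1) and appending 'B' repeated path times
 * plus a final 'P'. If the best shorter prefixes happen to be all-P, the candidates are
 *     "PPPP", "PPBP", "PBBP", "BBBP"
 * and the cheapest one according to slicetype_path_cost() is stored back into best_paths[].)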
*/ static void slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, char (*best_paths)[X264_LOOKAHEAD_MAX+1] ) { char paths[2][X264_LOOKAHEAD_MAX+1]; int num_paths = X264_MIN( h->param.i_bframe+1, length ); uint64_t best_cost = COST_MAX64; int best_possible = 0; int idx = 0; /* Iterate over all currently possible paths */ for( int path = 0; path < num_paths; path++ ) { /* Add suffixes to the current path */ int len = length - (path + 1); memcpy( paths[idx], best_paths[len % (X264_BFRAME_MAX+1)], len ); memset( paths[idx]+len, 'B', path ); strcpy( paths[idx]+len+path, "P" ); int possible = 1; for( int i = 1; i <= length; i++ ) { int i_type = frames[i]->i_type; if( i_type == X264_TYPE_AUTO ) continue; if( IS_X264_TYPE_B( i_type ) ) possible = possible && (i < len || i == length || paths[idx][i-1] == 'B'); else { possible = possible && (i < len || paths[idx][i-1] != 'B'); paths[idx][i-1] = IS_X264_TYPE_I( i_type ) ? 'I' : 'P'; } } if( possible || !best_possible ) { if( possible && !best_possible ) best_cost = COST_MAX64; /* Calculate the actual cost of the current path */ uint64_t cost = slicetype_path_cost( h, a, frames, paths[idx], best_cost ); if( cost < best_cost ) { best_cost = cost; best_possible = possible; idx ^= 1; } } } /* Store the best path. */ memcpy( best_paths[length % (X264_BFRAME_MAX+1)], paths[idx^1], length ); } static int scenecut_internal( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut ) { x264_frame_t *frame = frames[p1]; /* Don't do scenecuts on the right view of a frame-packed video. */ if( real_scenecut && h->param.i_frame_packing == 5 && (frame->i_frame&1) ) return 0; slicetype_frame_cost( h, a, frames, p0, p1, p1 ); int icost = frame->i_cost_est[0][0]; int pcost = frame->i_cost_est[p1-p0][0]; float f_bias; int i_gop_size = frame->i_frame - h->lookahead->i_last_keyframe; float f_thresh_max = h->param.i_scenecut_threshold / 100.0; /* magic numbers pulled out of thin air */ float f_thresh_min = f_thresh_max * 0.25; int res; if( h->param.i_keyint_min == h->param.i_keyint_max ) f_thresh_min = f_thresh_max; if( i_gop_size <= h->param.i_keyint_min / 4 || h->param.b_intra_refresh ) f_bias = f_thresh_min / 4; else if( i_gop_size <= h->param.i_keyint_min ) f_bias = f_thresh_min * i_gop_size / h->param.i_keyint_min; else { f_bias = f_thresh_min + ( f_thresh_max - f_thresh_min ) * ( i_gop_size - h->param.i_keyint_min ) / ( h->param.i_keyint_max - h->param.i_keyint_min ); } res = pcost >= (1.0 - f_bias) * icost; if( res && real_scenecut ) { int imb = frame->i_intra_mbs[p1-p0]; int pmb = NUM_MBS - imb; x264_log( h, X264_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n", frame->i_frame, icost, pcost, 1. - (double)pcost / icost, f_bias, i_gop_size, imb, pmb ); } return res; } static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int real_scenecut, int num_frames, int i_max_search ) { /* Only do analysis during a normal scenecut check. */ if( real_scenecut && h->param.i_bframe ) { int origmaxp1 = p0 + 1; /* Look ahead to avoid coding short flashes as scenecuts. */ if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) /* Don't analyse any more frames than the trellis would have covered. */ origmaxp1 += h->param.i_bframe; else origmaxp1++; int maxp1 = X264_MIN( origmaxp1, num_frames ); /* Where A and B are scenes: AAAAAABBBAAAAAA * If BBB is shorter than (maxp1-p0), it is detected as a flash * and not considered a scenecut. 
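 * (Editor's worked example of scenecut_internal()'s threshold above, not from the original
 * source: with the default --scenecut 40, f_thresh_max = 0.40 and f_thresh_min = 0.10.
 * Assuming keyint_min = 25 and keyint_max = 250, a frame 137 frames into its GOP gets
 *     f_bias = 0.10 + (0.40 - 0.10) * (137 - 25) / (250 - 25) ~= 0.25
 * so it is flagged as a scenecut iff pcost >= (1 - 0.25) * icost, i.e. iff inter prediction
 * saves less than about 25% over intra.)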
*/ for( int curp1 = p1; curp1 <= maxp1; curp1++ ) if( !scenecut_internal( h, a, frames, p0, curp1, 0 ) ) /* Any frame in between p0 and cur_p1 cannot be a real scenecut. */ for( int i = curp1; i > p0; i-- ) frames[i]->b_scenecut = 0; /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF * If each of BB ... EE are shorter than (maxp1-p0), they are * detected as flashes and not considered scenecuts. * Instead, the first F frame becomes a scenecut. * If the video ends before F, no frame becomes a scenecut. */ for( int curp0 = p0; curp0 <= maxp1; curp0++ ) if( origmaxp1 > i_max_search || (curp0 < maxp1 && scenecut_internal( h, a, frames, curp0, maxp1, 0 )) ) /* If cur_p0 is the p0 of a scenecut, it cannot be the p1 of a scenecut. */ frames[curp0]->b_scenecut = 0; } /* Ignore frames that are part of a flash, i.e. cannot be real scenecuts. */ if( !frames[p1]->b_scenecut ) return 0; return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); } #define IS_X264_TYPE_AUTO_OR_I(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_I(x)) #define IS_X264_TYPE_AUTO_OR_B(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_B(x)) void x264_slicetype_analyse( x264_t *h, int intra_minigop ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; int num_frames, orig_num_frames, keyint_limit, framecnt; int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); int b_vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead; /* For determinism we should limit the search to the number of frames lookahead has for sure * in h->lookahead->next.list buffer, except at the end of stream. * For normal calls with (intra_minigop == 0) that is h->lookahead->i_slicetype_length + 1 frames. * And for I-frame calls (intra_minigop != 0) we already removed intra_minigop frames from there. */ if( h->param.b_deterministic ) i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + 1 - intra_minigop ); int keyframe = !!intra_minigop; assert( h->frames.b_have_lowres ); if( !h->lookahead->last_nonb ) return; frames[0] = h->lookahead->last_nonb; for( framecnt = 0; framecnt < i_max_search; framecnt++ ) frames[framecnt+1] = h->lookahead->next.list[framecnt]; lowres_context_init( h, &a ); if( !framecnt ) { if( h->param.rc.b_mb_tree ) macroblock_tree( h, &a, frames, 0, keyframe ); return; } keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_keyframe - 1; orig_num_frames = num_frames = h->param.b_intra_refresh ? framecnt : X264_MIN( framecnt, keyint_limit ); /* This is important psy-wise: if we have a non-scenecut keyframe, * there will be significant visual artifacts if the frames just before * go down in quality due to being referenced less, despite it being * more RD-optimal. */ if( (h->param.analyse.b_psy && h->param.rc.b_mb_tree) || b_vbv_lookahead ) num_frames = framecnt; else if( h->param.b_open_gop && num_frames < framecnt ) num_frames++; else if( num_frames == 0 ) { frames[1]->i_type = X264_TYPE_I; return; } if( IS_X264_TYPE_AUTO_OR_I( frames[1]->i_type ) && h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) ) { if( frames[1]->i_type == X264_TYPE_AUTO ) frames[1]->i_type = X264_TYPE_I; return; } #if HAVE_OPENCL x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda ); #endif /* Replace forced keyframes with I/IDR-frames */ for( int j = 1; j <= num_frames; j++ ) { if( frames[j]->i_type == X264_TYPE_KEYFRAME ) frames[j]->i_type = h->param.b_open_gop ? 
X264_TYPE_I : X264_TYPE_IDR; } /* Close GOP at IDR-frames */ for( int j = 2; j <= num_frames; j++ ) { if( frames[j]->i_type == X264_TYPE_IDR && IS_X264_TYPE_AUTO_OR_B( frames[j-1]->i_type ) ) frames[j-1]->i_type = X264_TYPE_P; } int num_analysed_frames = num_frames; int reset_start; if( h->param.i_bframe ) { if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) { if( num_frames > 1 ) { char best_paths[X264_BFRAME_MAX+1][X264_LOOKAHEAD_MAX+1] = {"","P"}; int best_path_index = num_frames % (X264_BFRAME_MAX+1); /* Perform the frametype analysis. */ for( int j = 2; j <= num_frames; j++ ) slicetype_path( h, &a, frames, j, best_paths ); /* Load the results of the analysis into the frame types. */ for( int j = 1; j < num_frames; j++ ) { if( best_paths[best_path_index][j-1] != 'B' ) { if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) frames[j]->i_type = X264_TYPE_P; } else { if( frames[j]->i_type == X264_TYPE_AUTO ) frames[j]->i_type = X264_TYPE_B; } } } } else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST ) { int last_nonb = 0; int num_bframes = h->param.i_bframe; char path[X264_LOOKAHEAD_MAX+1]; for( int j = 1; j < num_frames; j++ ) { if( j-1 > 0 && IS_X264_TYPE_B( frames[j-1]->i_type ) ) num_bframes--; else { last_nonb = j-1; num_bframes = h->param.i_bframe; } if( !num_bframes ) { if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) frames[j]->i_type = X264_TYPE_P; continue; } if( frames[j]->i_type != X264_TYPE_AUTO ) continue; if( IS_X264_TYPE_B( frames[j+1]->i_type ) ) { frames[j]->i_type = X264_TYPE_P; continue; } int bframes = j - last_nonb - 1; memset( path, 'B', bframes ); strcpy( path+bframes, "PP" ); uint64_t cost_p = slicetype_path_cost( h, &a, frames+last_nonb, path, COST_MAX64 ); strcpy( path+bframes, "BP" ); uint64_t cost_b = slicetype_path_cost( h, &a, frames+last_nonb, path, cost_p ); if( cost_b < cost_p ) frames[j]->i_type = X264_TYPE_B; else frames[j]->i_type = X264_TYPE_P; } } else { int num_bframes = h->param.i_bframe; for( int j = 1; j < num_frames; j++ ) { if( !num_bframes ) { if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) frames[j]->i_type = X264_TYPE_P; } else if( frames[j]->i_type == X264_TYPE_AUTO ) { if( IS_X264_TYPE_B( frames[j+1]->i_type ) ) frames[j]->i_type = X264_TYPE_P; else frames[j]->i_type = X264_TYPE_B; } if( IS_X264_TYPE_B( frames[j]->i_type ) ) num_bframes--; else num_bframes = h->param.i_bframe; } } if( IS_X264_TYPE_AUTO_OR_B( frames[num_frames]->i_type ) ) frames[num_frames]->i_type = X264_TYPE_P; int num_bframes = 0; while( num_bframes < num_frames && IS_X264_TYPE_B( frames[num_bframes+1]->i_type ) ) num_bframes++; /* Check scenecut on the first minigop. */ for( int j = 1; j < num_bframes+1; j++ ) { if( frames[j]->i_forced_type == X264_TYPE_AUTO && IS_X264_TYPE_AUTO_OR_I( frames[j+1]->i_forced_type ) && h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) ) { frames[j]->i_type = X264_TYPE_P; num_analysed_frames = j; break; } } reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 ); } else { for( int j = 1; j <= num_frames; j++ ) if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) frames[j]->i_type = X264_TYPE_P; reset_start = !keyframe + 1; } /* Perform the actual macroblock tree analysis. * Don't go farther than the maximum keyframe interval; this helps in short GOPs. */ if( h->param.rc.b_mb_tree ) macroblock_tree( h, &a, frames, X264_MIN(num_frames, h->param.i_keyint_max), keyframe ); /* Enforce keyframe limit. 
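 * (Editor's illustration, not part of the original source: e.g. with --keyint 250 and the
 * last keyframe at display frame 100, the first frame whose distance reaches 250, frame 350,
 * trips keyframe_dist >= i_keyint_max below. The code then backs up to the most recent
 * position where a keyframe is actually allowed (last_possible, which for closed GOP requires
 * the preceding frame not to be a forced B), and promotes that frame to IDR, or to I with
 * --open-gop.)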
*/ if( !h->param.b_intra_refresh ) { int last_keyframe = h->lookahead->i_last_keyframe; int last_possible = 0; for( int j = 1; j <= num_frames; j++ ) { x264_frame_t *frm = frames[j]; int keyframe_dist = frm->i_frame - last_keyframe; if( IS_X264_TYPE_AUTO_OR_I( frm->i_forced_type ) ) { if( h->param.b_open_gop || !IS_X264_TYPE_B( frames[j-1]->i_forced_type ) ) last_possible = j; } if( keyframe_dist >= h->param.i_keyint_max ) { if( last_possible != 0 && last_possible != j ) { j = last_possible; frm = frames[j]; keyframe_dist = frm->i_frame - last_keyframe; } last_possible = 0; if( frm->i_type != X264_TYPE_IDR ) frm->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR; } if( frm->i_type == X264_TYPE_I && keyframe_dist >= h->param.i_keyint_min ) { if( h->param.b_open_gop ) { last_keyframe = frm->i_frame; if( h->param.b_bluray_compat ) { // Use bluray order int bframes = 0; while( bframes < j-1 && IS_X264_TYPE_B( frames[j-1-bframes]->i_type ) ) bframes++; last_keyframe -= bframes; } } else if( frm->i_forced_type != X264_TYPE_I ) frm->i_type = X264_TYPE_IDR; } if( frm->i_type == X264_TYPE_IDR ) { last_keyframe = frm->i_frame; if( j > 1 && IS_X264_TYPE_B( frames[j-1]->i_type ) ) frames[j-1]->i_type = X264_TYPE_P; } } } if( b_vbv_lookahead ) vbv_lookahead( h, &a, frames, num_frames, keyframe ); /* Restore frametypes for all frames that haven't actually been decided yet. */ for( int j = reset_start; j <= num_frames; j++ ) frames[j]->i_type = frames[j]->i_forced_type; #if HAVE_OPENCL x264_opencl_slicetype_end( h ); #endif } void x264_slicetype_decide( x264_t *h ) { x264_frame_t *frames[X264_BFRAME_MAX+2]; x264_frame_t *frm; int bframes; int brefs; if( !h->lookahead->next.i_size ) return; int lookahead_size = h->lookahead->next.i_size; for( int i = 0; i < h->lookahead->next.i_size; i++ ) { if( h->param.b_vfr_input ) { if( lookahead_size-- > 1 ) h->lookahead->next.list[i]->i_duration = 2 * (h->lookahead->next.list[i+1]->i_pts - h->lookahead->next.list[i]->i_pts); else h->lookahead->next.list[i]->i_duration = h->i_prev_duration; } else h->lookahead->next.list[i]->i_duration = delta_tfi_divisor[h->lookahead->next.list[i]->i_pic_struct]; h->i_prev_duration = h->lookahead->next.list[i]->i_duration; h->lookahead->next.list[i]->f_duration = (double)h->lookahead->next.list[i]->i_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; if( h->lookahead->next.list[i]->i_frame > h->i_disp_fields_last_frame && lookahead_size > 0 ) { h->lookahead->next.list[i]->i_field_cnt = h->i_disp_fields; h->i_disp_fields += h->lookahead->next.list[i]->i_duration; h->i_disp_fields_last_frame = h->lookahead->next.list[i]->i_frame; } else if( lookahead_size == 0 ) { h->lookahead->next.list[i]->i_field_cnt = h->i_disp_fields; h->lookahead->next.list[i]->i_duration = h->i_prev_duration; } } if( h->param.rc.b_stat_read ) { /* Use the frame types from the first pass */ for( int i = 0; i < h->lookahead->next.i_size; i++ ) h->lookahead->next.list[i]->i_type = x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame ); } else if( (h->param.i_bframe && h->param.i_bframe_adaptive) || h->param.i_scenecut_threshold || h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) ) x264_slicetype_analyse( h, 0 ); for( bframes = 0, brefs = 0;; bframes++ ) { frm = h->lookahead->next.list[bframes]; if( frm->i_forced_type != X264_TYPE_AUTO && frm->i_type != frm->i_forced_type && !(frm->i_forced_type == X264_TYPE_KEYFRAME && IS_X264_TYPE_I( frm->i_type )) ) { x264_log( h, X264_LOG_WARNING, 
"forced frame type (%d) at %d was changed to frame type (%d)\n", frm->i_forced_type, frm->i_frame, frm->i_type ); } if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL && brefs == h->param.i_bframe_pyramid ) { frm->i_type = X264_TYPE_B; x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s \n", frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid] ); } /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available. smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */ else if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid == X264_B_PYRAMID_NORMAL && brefs && h->param.i_frame_reference <= (brefs+3) ) { frm->i_type = X264_TYPE_B; x264_log( h, X264_LOG_WARNING, "B-ref at frame %d incompatible with B-pyramid %s and %d reference frames\n", frm->i_frame, x264_b_pyramid_names[h->param.i_bframe_pyramid], h->param.i_frame_reference ); } if( frm->i_type == X264_TYPE_KEYFRAME ) frm->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR; /* Limit GOP size */ if( (!h->param.b_intra_refresh || frm->i_frame == 0) && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_max ) { if( frm->i_type == X264_TYPE_AUTO || frm->i_type == X264_TYPE_I ) frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR; int warn = frm->i_type != X264_TYPE_IDR; if( warn && h->param.b_open_gop ) warn &= frm->i_type != X264_TYPE_I; if( warn ) { x264_log( h, X264_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm->i_type, frm->i_frame ); frm->i_type = h->param.b_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR; } } if( frm->i_type == X264_TYPE_I && frm->i_frame - h->lookahead->i_last_keyframe >= h->param.i_keyint_min ) { if( h->param.b_open_gop ) { h->lookahead->i_last_keyframe = frm->i_frame; // Use display order if( h->param.b_bluray_compat ) h->lookahead->i_last_keyframe -= bframes; // Use bluray order frm->b_keyframe = 1; } else frm->i_type = X264_TYPE_IDR; } if( frm->i_type == X264_TYPE_IDR ) { /* Close GOP */ h->lookahead->i_last_keyframe = frm->i_frame; frm->b_keyframe = 1; if( bframes > 0 ) { bframes--; h->lookahead->next.list[bframes]->i_type = X264_TYPE_P; } } if( bframes == h->param.i_bframe || !h->lookahead->next.list[bframes+1] ) { if( IS_X264_TYPE_B( frm->i_type ) ) x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" ); if( frm->i_type == X264_TYPE_AUTO || IS_X264_TYPE_B( frm->i_type ) ) frm->i_type = X264_TYPE_P; } if( frm->i_type == X264_TYPE_BREF ) brefs++; if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_B; else if( !IS_X264_TYPE_B( frm->i_type ) ) break; } if( bframes ) h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1; h->lookahead->next.list[bframes]->i_bframes = bframes; /* insert a bref into the sequence */ if( h->param.i_bframe_pyramid && bframes > 1 && !brefs ) { h->lookahead->next.list[(bframes-1)/2]->i_type = X264_TYPE_BREF; brefs++; } /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */ if( h->param.rc.i_rc_method != X264_RC_CQP ) { x264_mb_analysis_t a; int p0, p1, b; p1 = b = bframes + 1; lowres_context_init( h, &a ); frames[0] = h->lookahead->last_nonb; memcpy( &frames[1], h->lookahead->next.list, (bframes+1) * sizeof(x264_frame_t*) ); if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) ) p0 
= bframes + 1; else // P p0 = 0; slicetype_frame_cost( h, &a, frames, p0, p1, b ); if( (p0 != p1 || bframes) && h->param.rc.i_vbv_buffer_size ) { /* We need the intra costs for row SATDs. */ slicetype_frame_cost( h, &a, frames, b, b, b ); /* We need B-frame costs for row SATDs. */ p0 = 0; for( b = 1; b <= bframes; b++ ) { if( frames[b]->i_type == X264_TYPE_B ) for( p1 = b; frames[p1]->i_type == X264_TYPE_B; ) p1++; else p1 = bframes + 1; slicetype_frame_cost( h, &a, frames, p0, p1, b ); if( frames[b]->i_type == X264_TYPE_BREF ) p0 = b; } } } /* Analyse for weighted P frames */ if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE ) { x264_emms(); x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 ); } /* shift sequence to coded order. use a small temporary list to avoid shifting the entire next buffer around */ int i_coded = h->lookahead->next.list[0]->i_frame; if( bframes ) { int idx_list[] = { brefs+1, 1 }; for( int i = 0; i < bframes; i++ ) { int idx = idx_list[h->lookahead->next.list[i]->i_type == X264_TYPE_BREF]++; frames[idx] = h->lookahead->next.list[i]; frames[idx]->i_reordered_pts = h->lookahead->next.list[idx]->i_pts; } frames[0] = h->lookahead->next.list[bframes]; frames[0]->i_reordered_pts = h->lookahead->next.list[0]->i_pts; memcpy( h->lookahead->next.list, frames, (bframes+1) * sizeof(x264_frame_t*) ); } for( int i = 0; i <= bframes; i++ ) { h->lookahead->next.list[i]->i_coded = i_coded++; if( i ) { calculate_durations( h, h->lookahead->next.list[i], h->lookahead->next.list[i-1], &h->i_cpb_delay, &h->i_coded_fields ); h->lookahead->next.list[0]->f_planned_cpb_duration[i-1] = (double)h->lookahead->next.list[i]->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; } else calculate_durations( h, h->lookahead->next.list[i], NULL, &h->i_cpb_delay, &h->i_coded_fields ); } } int x264_rc_analyse_slice( x264_t *h ) { int p0 = 0, p1, b; int cost; x264_emms(); if( IS_X264_TYPE_I(h->fenc->i_type) ) p1 = b = 0; else if( h->fenc->i_type == X264_TYPE_P ) p1 = b = h->fenc->i_bframes + 1; else //B { p1 = (h->fref_nearest[1]->i_poc - h->fref_nearest[0]->i_poc)/2; b = (h->fenc->i_poc - h->fref_nearest[0]->i_poc)/2; } /* We don't need to assign p0/p1 since we are not performing any real analysis here. */ x264_frame_t **frames = &h->fenc - b; /* cost should have been already calculated by x264_slicetype_decide */ cost = frames[b]->i_cost_est[b-p0][p1-b]; assert( cost >= 0 ); if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read ) { cost = slicetype_frame_cost_recalculate( h, frames, p0, p1, b ); if( b && h->param.rc.i_vbv_buffer_size ) slicetype_frame_cost_recalculate( h, frames, b, b, b ); } /* In AQ, use the weighted score instead. 
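 * (Editor's worked example, not from the original source: the AQ-weighted estimates scale
 * each macroblock's SATD by its fix8 factor i_inv_qscale_factor, i.e.
 *     cost_aq = (cost * factor + 128) >> 8;
 * Assuming the usual x264_exp2fix8() behaviour of roughly 256 * 2^(-qp_offset/6), a
 * macroblock whose AQ offset lowers its QP by 2 contributes about 2^(2/6) ~= 1.26x of its
 * raw SATD, while one whose QP is raised by 2 contributes about 0.79x.)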
*/ else if( h->param.rc.i_aq_mode ) cost = frames[b]->i_cost_est_aq[b-p0][p1-b]; h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b]; h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b]; h->fdec->i_satd = cost; memcpy( h->fdec->i_row_satd, h->fenc->i_row_satd, h->mb.i_mb_height * sizeof(int) ); if( !IS_X264_TYPE_I(h->fenc->i_type) ) memcpy( h->fdec->i_row_satds[0][0], h->fenc->i_row_satds[0][0], h->mb.i_mb_height * sizeof(int) ); if( h->param.b_intra_refresh && h->param.rc.i_vbv_buffer_size && h->fenc->i_type == X264_TYPE_P ) { int ip_factor = 256 * h->param.rc.f_ip_factor; /* fix8 */ for( int y = 0; y < h->mb.i_mb_height; y++ ) { int mb_xy = y * h->mb.i_mb_stride + h->fdec->i_pir_start_col; for( int x = h->fdec->i_pir_start_col; x <= h->fdec->i_pir_end_col; x++, mb_xy++ ) { int intra_cost = (h->fenc->i_intra_cost[mb_xy] * ip_factor + 128) >> 8; int inter_cost = h->fenc->lowres_costs[b-p0][p1-b][mb_xy] & LOWRES_COST_MASK; int diff = intra_cost - inter_cost; if( h->param.rc.i_aq_mode ) h->fdec->i_row_satd[y] += (diff * frames[b]->i_inv_qscale_factor[mb_xy] + 128) >> 8; else h->fdec->i_row_satd[y] += diff; cost += diff; } } } return cost; } x264-master/example.c000066400000000000000000000101521502133446700146550ustar00rootroot00000000000000/***************************************************************************** * example.c: libx264 API usage example ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifdef _WIN32 #include /* _setmode() */ #include /* _O_BINARY */ #endif #include #include #include #define FAIL_IF_ERROR( cond, ... )\ do\ {\ if( cond )\ {\ fprintf( stderr, __VA_ARGS__ );\ goto fail;\ }\ } while( 0 ) int main( int argc, char **argv ) { int width, height; x264_param_t param; x264_picture_t pic; x264_picture_t pic_out; x264_t *h; int i_frame = 0; int i_frame_size; x264_nal_t *nal; int i_nal; #ifdef _WIN32 _setmode( _fileno( stdin ), _O_BINARY ); _setmode( _fileno( stdout ), _O_BINARY ); _setmode( _fileno( stderr ), _O_BINARY ); #endif FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 output.h264\n" ); FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" ); /* Get default params for preset/tuning */ if( x264_param_default_preset( ¶m, "medium", NULL ) < 0 ) goto fail; /* Configure non-default params */ param.i_bitdepth = 8; param.i_csp = X264_CSP_I420; param.i_width = width; param.i_height = height; param.b_vfr_input = 0; param.b_repeat_headers = 1; param.b_annexb = 1; /* Apply profile restrictions. 
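 * (Editor's note, not part of the original example: x264_param_apply_profile() only
 * restricts or rejects settings to fit the profile; it does not turn features on. With the
 * 8-bit I420 setup above, "high" mostly checks that no lossless/4:2:2/4:4:4 options are in
 * use; passing "baseline" instead would additionally disable CABAC and B-frames, e.g.
 *     x264_param_apply_profile( &param, "baseline" );
 * A profile of NULL leaves the parameters untouched.)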
*/ if( x264_param_apply_profile( ¶m, "high" ) < 0 ) goto fail; if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 ) goto fail; #undef fail #define fail fail2 h = x264_encoder_open( ¶m ); if( !h ) goto fail; #undef fail #define fail fail3 int luma_size = width * height; int chroma_size = luma_size / 4; /* Encode frames */ for( ;; i_frame++ ) { /* Read input frame */ if( fread( pic.img.plane[0], 1, luma_size, stdin ) != (unsigned)luma_size ) break; if( fread( pic.img.plane[1], 1, chroma_size, stdin ) != (unsigned)chroma_size ) break; if( fread( pic.img.plane[2], 1, chroma_size, stdin ) != (unsigned)chroma_size ) break; pic.i_pts = i_frame; i_frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic, &pic_out ); if( i_frame_size < 0 ) goto fail; else if( i_frame_size ) { if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) ) goto fail; } } /* Flush delayed frames */ while( x264_encoder_delayed_frames( h ) ) { i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out ); if( i_frame_size < 0 ) goto fail; else if( i_frame_size ) { if( !fwrite( nal->p_payload, i_frame_size, 1, stdout ) ) goto fail; } } x264_encoder_close( h ); x264_picture_clean( &pic ); return 0; #undef fail fail3: x264_encoder_close( h ); fail2: x264_picture_clean( &pic ); fail: return -1; } x264-master/extras/000077500000000000000000000000001502133446700143655ustar00rootroot00000000000000x264-master/extras/avisynth_c.h000066400000000000000000002133601502133446700167120ustar00rootroot00000000000000// Avisynth C Interface Version 0.20 // Copyright 2003 Kevin Atkinson // Copyright 2020 AviSynth+ project // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, // MA 02110-1301 USA, or visit // http://www.gnu.org/copyleft/gpl.html . // // As a special exception, I give you permission to link to the // Avisynth C interface with independent modules that communicate with // the Avisynth C interface solely through the interfaces defined in // avisynth_c.h, regardless of the license terms of these independent // modules, and to copy and distribute the resulting combined work // under terms of your choice, provided that every copy of the // combined work is accompanied by a complete copy of the source code // of the Avisynth C interface and Avisynth itself (with the version // used to produce the combined work), being distributed under the // terms of the GNU General Public License plus this exception. An // independent module is a module which is not derived from or based // on Avisynth C Interface, such as 3rd-party filters, import and // export plugins, or graphical user interfaces. // NOTE: this is a partial update of the Avisynth C interface to recognize // new color spaces and interface elements added in Avisynth 2.60 and AviSynth+. // This interface is not 100% Avisynth+ CPP interface equivalent. 
// 170103: added new CPU constants (FMA4, AVX512xx) // 171102: define SIZETMOD. do not use yet, experimental. Offsets are size_t instead of int. Affects x64. // 171106: avs_get_row_size calls into avs_get_row_size_p, instead of direct field access // 171106: avs_get_height calls into avs_get_row_size_p, instead of direct field access // 180524: AVSC_EXPORT to dllexport in capi.h for avisynth_c_plugin_init // 180524: avs_is_same_colorspace VideoInfo parameters to const // 181230: Readability: functions regrouped to mix less AVSC_API and AVSC_INLINE, put together Avisynth+ specific stuff // 181230: use #ifndef AVSC_NO_DECLSPEC for AVSC_INLINE functions which are calling API functions // 181230: comments on avs_load_library (helper for loading API entries dynamically into a struct using AVSC_NO_DECLSPEC define) // 181230: define alias AVS_FRAME_ALIGN as FRAME_ALIGN // 181230: remove unused form of avs_get_rowsize and avs_get_height (kept earlier for reference) // 190104: avs_load_library: smart fallback mechanism for Avisynth+ specific functions: // if they are not loadable, they will work in a classic Avisynth compatible mode // Example#1: e.g. avs_is_444 will call the existing avs_is_yv24 instead // Example#2: avs_bits_per_component will return 8 for all colorspaces (Classic Avisynth supports only 8 bits/pixel) // Thus the Avisynth+ specific API functions are safely callable even when connected to classic Avisynth DLL // 202002xx non-Windows friendly additions // 20200305 avs_vsprintf parameter type change: (void *) to va_list // 20200330: (remove test SIZETMOD define for clarity) // 20200513: user must use explicite #define AVS26_FALLBACK_SIMULATION for having fallback helpers in dynamic loaded library section // 20200513: Follow AviSynth+ V8 interface additions // AVS_VideoFrame struct extended with placeholder for frame property pointer // avs_subframe_planar_a // avs_copy_frame_props // avs_get_frame_props_ro, avs_get_frame_props_rw // avs_prop_num_keys, avs_prop_get_key, avs_prop_num_elements, avs_prop_get_type, avs_prop_get_data_size // avs_prop_get_int, avs_prop_get_float, avs_prop_get_data, avs_prop_get_clip, avs_prop_get_frame, avs_prop_get_int_array, avs_prop_get_float_array // avs_prop_set_int, avs_prop_set_float, avs_prop_set_data, avs_prop_set_clip, avs_prop_set_frame, avs_prop_set_int_array, avs_prop_set_float_array // avs_prop_delete_key, avs_clear_map // avs_new_video_frame_p, avs_new_video_frame_p_a // avs_get_env_property (internal system properties), AVS_AEP_xxx (AvsEnvProperty) enums // avs_get_var_try, avs_get_var_bool, avs_get_var_int, avs_get_var_double, avs_get_var_string, avs_get_var_long // avs_pool_allocate, avs_pool_free #ifndef __AVISYNTH_C__ #define __AVISYNTH_C__ #ifndef AVS_CONFIG_H #define AVS_CONFIG_H // Undefine this to get cdecl calling convention #define AVSC_USE_STDCALL 1 // NOTE TO PLUGIN AUTHORS: // Because AVS_FRAME_ALIGN can be substantially higher than the alignment // a plugin actually needs, plugins should not use AVS_FRAME_ALIGN to check for // alignment. They should always request the exact alignment value they need. // This is to make sure that plugins work over the widest range of AviSynth // builds possible. 
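// (Editor's illustrative sketch, not part of the original header: a plugin that needs,
//  say, 32-byte alignment for AVX2 loads should verify the frame it was actually handed
//  rather than compare against AVS_FRAME_ALIGN; for some AVS_VideoFrame *frame, e.g.
//      const BYTE *p = avs_get_read_ptr(frame);
//      int ok = ((uintptr_t)p & 31) == 0 && (avs_get_pitch(frame) & 31) == 0;
//  avs_get_read_ptr() and avs_get_pitch() are the standard accessors declared later in
//  this header.)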
#define AVS_FRAME_ALIGN 64 #if defined(_M_AMD64) || defined(__x86_64) # define AVS_X86_64 #elif defined(_M_IX86) || defined(__i386__) # define AVS_X86_32 // VS2017 introduced _M_ARM64 #elif defined(_M_ARM64) || defined(__aarch64__) # define AVS_ARM64 #elif defined(_M_ARM) || defined(__arm__) # define AVS_ARM32 #elif defined(__PPC64__) # define AVS_PPC64 #elif defined(_M_PPC) || defined(__PPC__) || defined(__POWERPC__) # define AVS_PPC32 #elif defined(__riscv) # define AVS_RISCV #elif defined(__sparc_v9__) # define AVS_SPARC #endif // VC++ LLVM-Clang-cl MinGW-Gnu // AVS_MSVC x x // AVS_MSVC_PURE x // AVS_CLANG x // AVS_GCC x #if defined(__clang__) // Check clang first. clang-cl also defines __MSC_VER // We set AVS_MSVC because they are mostly compatible # define AVS_CLANG #if defined(_MSC_VER) # define AVS_MSVC #endif #elif defined(_MSC_VER) # define AVS_MSVC # define AVS_MSVC_PURE #elif defined(__GNUC__) # define AVS_GCC #endif #if defined(_WIN32) || defined(__CYGWIN__) # define AVS_WINDOWS #elif defined(__linux__) # define AVS_LINUX # define AVS_POSIX #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) # define AVS_BSD # define AVS_POSIX #elif defined(__APPLE__) # define AVS_MACOS # define AVS_POSIX #elif defined(__HAIKU__) # define AVS_HAIKU # define AVS_POSIX #endif #ifndef AVS_MSVC // GCC and Clang can be used on big endian systems, MSVC can't. # if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ # define AVS_ENDIANNESS "little" # elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ # define AVS_ENDIANNESS "big" # else # define AVS_ENDIANNESS "middle" # endif #else #define AVS_ENDIANNESS "little" #endif #endif //AVS_CONFIG_H #ifndef AVS_CAPI_H #define AVS_CAPI_H #ifdef AVS_POSIX // this is also defined in avs/posix.h #ifndef AVS_HAIKU #define __declspec(x) #endif #endif #ifdef __cplusplus # define EXTERN_C extern "C" #else # define EXTERN_C #endif #ifdef AVS_WINDOWS #ifdef BUILDING_AVSCORE # if defined(AVS_GCC) && defined(AVS_X86_32) # define AVSC_CC # else // MSVC builds and 64-bit GCC # ifndef AVSC_USE_STDCALL # define AVSC_CC __cdecl # else # define AVSC_CC __stdcall # endif # endif #else // needed for programs that talk to AviSynth+ # ifndef AVSC_WIN32_GCC32 // see comment below # ifndef AVSC_USE_STDCALL # define AVSC_CC __cdecl # else # define AVSC_CC __stdcall # endif # else # define AVSC_CC # endif #endif # else # define AVSC_CC #endif // On 64-bit Windows, there's only one calling convention, // so there is no difference between MSVC and GCC. On 32-bit, // this isn't true. The convention that GCC needs to use to // even build AviSynth+ as 32-bit makes anything that uses // it incompatible with 32-bit MSVC builds of AviSynth+. // The AVSC_WIN32_GCC32 define is meant to provide a user // switchable way to make builds of FFmpeg to test 32-bit // GCC builds of AviSynth+ without having to screw around // with alternate headers, while still default to the usual // situation of using 32-bit MSVC builds of AviSynth+. // Hopefully, this situation will eventually be resolved // and a broadly compatible solution will arise so the // same 32-bit FFmpeg build can handle either MSVC or GCC // builds of AviSynth+. 
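// (Editor's note, not part of the original header: in practice that means a 32-bit MinGW/GCC
//  client talking to a GCC-built 32-bit AviSynth+ adds -DAVSC_WIN32_GCC32 at compile time so
//  that AVSC_CC expands to nothing, while the default build keeps the __stdcall/__cdecl
//  decoration and matches the usual MSVC-built AviSynth+ DLL.)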
#define AVSC_INLINE static __inline #ifdef BUILDING_AVSCORE #ifdef AVS_WINDOWS # ifndef AVS_STATIC_LIB # define AVSC_EXPORT __declspec(dllexport) # else # define AVSC_EXPORT # endif # define AVSC_API(ret, name) EXTERN_C AVSC_EXPORT ret AVSC_CC name #else # define AVSC_EXPORT EXTERN_C # define AVSC_API(ret, name) EXTERN_C ret AVSC_CC name #endif #else # define AVSC_EXPORT EXTERN_C __declspec(dllexport) # ifndef AVS_STATIC_LIB # define AVSC_IMPORT __declspec(dllimport) # else # define AVSC_IMPORT # endif # ifndef AVSC_NO_DECLSPEC # define AVSC_API(ret, name) EXTERN_C AVSC_IMPORT ret AVSC_CC name # else # define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) # endif #endif #endif //AVS_CAPI_H #ifndef AVS_TYPES_H #define AVS_TYPES_H // Define all types necessary for interfacing with avisynth.dll #include // Raster types used by VirtualDub & Avisynth typedef uint8_t BYTE; #endif //AVS_TYPES_H ///////////////////////////////////////////////////////////////////// // // Constants // #ifndef __AVISYNTH_9_H__ enum { AVISYNTH_INTERFACE_CLASSIC_VERSION = 6, AVISYNTH_INTERFACE_VERSION = 9, AVISYNTHPLUS_INTERFACE_BUGFIX_VERSION = 0 // reset to zero whenever the normal interface version bumps }; #endif enum {AVS_SAMPLE_INT8 = 1<<0, AVS_SAMPLE_INT16 = 1<<1, AVS_SAMPLE_INT24 = 1<<2, AVS_SAMPLE_INT32 = 1<<3, AVS_SAMPLE_FLOAT = 1<<4}; enum {AVS_PLANAR_Y=1<<0, AVS_PLANAR_U=1<<1, AVS_PLANAR_V=1<<2, AVS_PLANAR_ALIGNED=1<<3, AVS_PLANAR_Y_ALIGNED=AVS_PLANAR_Y|AVS_PLANAR_ALIGNED, AVS_PLANAR_U_ALIGNED=AVS_PLANAR_U|AVS_PLANAR_ALIGNED, AVS_PLANAR_V_ALIGNED=AVS_PLANAR_V|AVS_PLANAR_ALIGNED, AVS_PLANAR_A=1<<4, AVS_PLANAR_R=1<<5, AVS_PLANAR_G=1<<6, AVS_PLANAR_B=1<<7, AVS_PLANAR_A_ALIGNED=AVS_PLANAR_A|AVS_PLANAR_ALIGNED, AVS_PLANAR_R_ALIGNED=AVS_PLANAR_R|AVS_PLANAR_ALIGNED, AVS_PLANAR_G_ALIGNED=AVS_PLANAR_G|AVS_PLANAR_ALIGNED, AVS_PLANAR_B_ALIGNED=AVS_PLANAR_B|AVS_PLANAR_ALIGNED}; // Colorspace properties. 
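// (Editor's illustrative sketch, not part of the original header: the constants below are
//  bitfields, so layout questions can be answered with masks instead of enumerating formats.
//  For example, a bit-depth-agnostic 4:2:0 test, similar in spirit to avs_is_420(), is
//      const int layout_mask = AVS_CS_PLANAR_MASK & ~AVS_CS_SAMPLE_BITS_MASK;
//      int is_420 = (vi->pixel_type & layout_mask) == (AVS_CS_GENERIC_YUV420 & layout_mask);
//  which matches YV12/I420 as well as the 10..16-bit and float 4:2:0 variants, but not the
//  alpha (YUVA420) ones.)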
enum { AVS_CS_YUVA = 1 << 27, AVS_CS_BGR = 1 << 28, AVS_CS_YUV = 1 << 29, AVS_CS_INTERLEAVED = 1 << 30, AVS_CS_PLANAR = 1 << 31, AVS_CS_SHIFT_SUB_WIDTH = 0, AVS_CS_SHIFT_SUB_HEIGHT = 8, AVS_CS_SHIFT_SAMPLE_BITS = 16, AVS_CS_SUB_WIDTH_MASK = 7 << AVS_CS_SHIFT_SUB_WIDTH, AVS_CS_SUB_WIDTH_1 = 3 << AVS_CS_SHIFT_SUB_WIDTH, // YV24 AVS_CS_SUB_WIDTH_2 = 0 << AVS_CS_SHIFT_SUB_WIDTH, // YV12, I420, YV16 AVS_CS_SUB_WIDTH_4 = 1 << AVS_CS_SHIFT_SUB_WIDTH, // YUV9, YV411 AVS_CS_VPLANEFIRST = 1 << 3, // YV12, YV16, YV24, YV411, YUV9 AVS_CS_UPLANEFIRST = 1 << 4, // I420 AVS_CS_SUB_HEIGHT_MASK = 7 << AVS_CS_SHIFT_SUB_HEIGHT, AVS_CS_SUB_HEIGHT_1 = 3 << AVS_CS_SHIFT_SUB_HEIGHT, // YV16, YV24, YV411 AVS_CS_SUB_HEIGHT_2 = 0 << AVS_CS_SHIFT_SUB_HEIGHT, // YV12, I420 AVS_CS_SUB_HEIGHT_4 = 1 << AVS_CS_SHIFT_SUB_HEIGHT, // YUV9 AVS_CS_SAMPLE_BITS_MASK = 7 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_SAMPLE_BITS_8 = 0 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_SAMPLE_BITS_10 = 5 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_SAMPLE_BITS_12 = 6 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_SAMPLE_BITS_14 = 7 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_SAMPLE_BITS_16 = 1 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_SAMPLE_BITS_32 = 2 << AVS_CS_SHIFT_SAMPLE_BITS, AVS_CS_PLANAR_MASK = AVS_CS_PLANAR | AVS_CS_INTERLEAVED | AVS_CS_YUV | AVS_CS_BGR | AVS_CS_YUVA | AVS_CS_SAMPLE_BITS_MASK | AVS_CS_SUB_HEIGHT_MASK | AVS_CS_SUB_WIDTH_MASK, AVS_CS_PLANAR_FILTER = ~(AVS_CS_VPLANEFIRST | AVS_CS_UPLANEFIRST), AVS_CS_RGB_TYPE = 1 << 0, AVS_CS_RGBA_TYPE = 1 << 1, AVS_CS_GENERIC_YUV420 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2, // 4:2:0 planar AVS_CS_GENERIC_YUV422 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_2, // 4:2:2 planar AVS_CS_GENERIC_YUV444 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_1, // 4:4:4 planar AVS_CS_GENERIC_Y = AVS_CS_PLANAR | AVS_CS_INTERLEAVED | AVS_CS_YUV, // Y only (4:0:0) AVS_CS_GENERIC_RGBP = AVS_CS_PLANAR | AVS_CS_BGR | AVS_CS_RGB_TYPE, // planar RGB AVS_CS_GENERIC_RGBAP = AVS_CS_PLANAR | AVS_CS_BGR | AVS_CS_RGBA_TYPE, // planar RGBA AVS_CS_GENERIC_YUVA420 = AVS_CS_PLANAR | AVS_CS_YUVA | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2, // 4:2:0:A planar AVS_CS_GENERIC_YUVA422 = AVS_CS_PLANAR | AVS_CS_YUVA | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_2, // 4:2:2:A planar AVS_CS_GENERIC_YUVA444 = AVS_CS_PLANAR | AVS_CS_YUVA | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_1 }; // 4:4:4:A planar // Specific color formats enum { AVS_CS_UNKNOWN = 0, AVS_CS_BGR24 = AVS_CS_RGB_TYPE | AVS_CS_BGR | AVS_CS_INTERLEAVED, AVS_CS_BGR32 = AVS_CS_RGBA_TYPE | AVS_CS_BGR | AVS_CS_INTERLEAVED, AVS_CS_YUY2 = 1<<2 | AVS_CS_YUV | AVS_CS_INTERLEAVED, // AVS_CS_YV12 = 1<<3 Reserved // AVS_CS_I420 = 1<<4 Reserved AVS_CS_RAW32 = 1<<5 | AVS_CS_INTERLEAVED, AVS_CS_YV24 = AVS_CS_GENERIC_YUV444 | AVS_CS_SAMPLE_BITS_8, // YUV 4:4:4 planar AVS_CS_YV16 = AVS_CS_GENERIC_YUV422 | AVS_CS_SAMPLE_BITS_8, // YUV 4:2:2 planar AVS_CS_YV12 = AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_8, // YUV 4:2:0 planar AVS_CS_I420 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_UPLANEFIRST | AVS_CS_SUB_HEIGHT_2 | AVS_CS_SUB_WIDTH_2, // YUV 4:2:0 planar AVS_CS_IYUV = AVS_CS_I420, AVS_CS_YV411 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | AVS_CS_SUB_HEIGHT_1 | AVS_CS_SUB_WIDTH_4, // YUV 4:1:1 planar AVS_CS_YUV9 = AVS_CS_PLANAR | AVS_CS_YUV | AVS_CS_SAMPLE_BITS_8 | AVS_CS_VPLANEFIRST | 
AVS_CS_SUB_HEIGHT_4 | AVS_CS_SUB_WIDTH_4, // YUV 4:1:0 planar AVS_CS_Y8 = AVS_CS_GENERIC_Y | AVS_CS_SAMPLE_BITS_8, // Y 4:0:0 planar //------------------------- // AVS16: new planar constants go live! Experimental PF 160613 // 10-12-14-16 bit + planar RGB + BGR48/64 160725 AVS_CS_YUV444P10 = AVS_CS_GENERIC_YUV444 | AVS_CS_SAMPLE_BITS_10, // YUV 4:4:4 10bit samples AVS_CS_YUV422P10 = AVS_CS_GENERIC_YUV422 | AVS_CS_SAMPLE_BITS_10, // YUV 4:2:2 10bit samples AVS_CS_YUV420P10 = AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_10, // YUV 4:2:0 10bit samples AVS_CS_Y10 = AVS_CS_GENERIC_Y | AVS_CS_SAMPLE_BITS_10, // Y 4:0:0 10bit samples AVS_CS_YUV444P12 = AVS_CS_GENERIC_YUV444 | AVS_CS_SAMPLE_BITS_12, // YUV 4:4:4 12bit samples AVS_CS_YUV422P12 = AVS_CS_GENERIC_YUV422 | AVS_CS_SAMPLE_BITS_12, // YUV 4:2:2 12bit samples AVS_CS_YUV420P12 = AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_12, // YUV 4:2:0 12bit samples AVS_CS_Y12 = AVS_CS_GENERIC_Y | AVS_CS_SAMPLE_BITS_12, // Y 4:0:0 12bit samples AVS_CS_YUV444P14 = AVS_CS_GENERIC_YUV444 | AVS_CS_SAMPLE_BITS_14, // YUV 4:4:4 14bit samples AVS_CS_YUV422P14 = AVS_CS_GENERIC_YUV422 | AVS_CS_SAMPLE_BITS_14, // YUV 4:2:2 14bit samples AVS_CS_YUV420P14 = AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_14, // YUV 4:2:0 14bit samples AVS_CS_Y14 = AVS_CS_GENERIC_Y | AVS_CS_SAMPLE_BITS_14, // Y 4:0:0 14bit samples AVS_CS_YUV444P16 = AVS_CS_GENERIC_YUV444 | AVS_CS_SAMPLE_BITS_16, // YUV 4:4:4 16bit samples AVS_CS_YUV422P16 = AVS_CS_GENERIC_YUV422 | AVS_CS_SAMPLE_BITS_16, // YUV 4:2:2 16bit samples AVS_CS_YUV420P16 = AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_16, // YUV 4:2:0 16bit samples AVS_CS_Y16 = AVS_CS_GENERIC_Y | AVS_CS_SAMPLE_BITS_16, // Y 4:0:0 16bit samples // 32 bit samples (float) AVS_CS_YUV444PS = AVS_CS_GENERIC_YUV444 | AVS_CS_SAMPLE_BITS_32, // YUV 4:4:4 32bit samples AVS_CS_YUV422PS = AVS_CS_GENERIC_YUV422 | AVS_CS_SAMPLE_BITS_32, // YUV 4:2:2 32bit samples AVS_CS_YUV420PS = AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_32, // YUV 4:2:0 32bit samples AVS_CS_Y32 = AVS_CS_GENERIC_Y | AVS_CS_SAMPLE_BITS_32, // Y 4:0:0 32bit samples // RGB packed AVS_CS_BGR48 = AVS_CS_RGB_TYPE | AVS_CS_BGR | AVS_CS_INTERLEAVED | AVS_CS_SAMPLE_BITS_16, // BGR 3x16 bit AVS_CS_BGR64 = AVS_CS_RGBA_TYPE | AVS_CS_BGR | AVS_CS_INTERLEAVED | AVS_CS_SAMPLE_BITS_16, // BGR 4x16 bit // no packed 32 bit (float) support for these legacy types // RGB planar AVS_CS_RGBP = AVS_CS_GENERIC_RGBP | AVS_CS_SAMPLE_BITS_8, // Planar RGB 8 bit samples AVS_CS_RGBP10 = AVS_CS_GENERIC_RGBP | AVS_CS_SAMPLE_BITS_10, // Planar RGB 10bit samples AVS_CS_RGBP12 = AVS_CS_GENERIC_RGBP | AVS_CS_SAMPLE_BITS_12, // Planar RGB 12bit samples AVS_CS_RGBP14 = AVS_CS_GENERIC_RGBP | AVS_CS_SAMPLE_BITS_14, // Planar RGB 14bit samples AVS_CS_RGBP16 = AVS_CS_GENERIC_RGBP | AVS_CS_SAMPLE_BITS_16, // Planar RGB 16bit samples AVS_CS_RGBPS = AVS_CS_GENERIC_RGBP | AVS_CS_SAMPLE_BITS_32, // Planar RGB 32bit samples // RGBA planar AVS_CS_RGBAP = AVS_CS_GENERIC_RGBAP | AVS_CS_SAMPLE_BITS_8, // Planar RGBA 8 bit samples AVS_CS_RGBAP10 = AVS_CS_GENERIC_RGBAP | AVS_CS_SAMPLE_BITS_10, // Planar RGBA 10bit samples AVS_CS_RGBAP12 = AVS_CS_GENERIC_RGBAP | AVS_CS_SAMPLE_BITS_12, // Planar RGBA 12bit samples AVS_CS_RGBAP14 = AVS_CS_GENERIC_RGBAP | AVS_CS_SAMPLE_BITS_14, // Planar RGBA 14bit samples AVS_CS_RGBAP16 = AVS_CS_GENERIC_RGBAP | AVS_CS_SAMPLE_BITS_16, // Planar RGBA 16bit samples AVS_CS_RGBAPS = AVS_CS_GENERIC_RGBAP | AVS_CS_SAMPLE_BITS_32, // Planar RGBA 32bit samples // Planar YUVA AVS_CS_YUVA444 = AVS_CS_GENERIC_YUVA444 | 
AVS_CS_SAMPLE_BITS_8, // YUVA 4:4:4 8bit samples AVS_CS_YUVA422 = AVS_CS_GENERIC_YUVA422 | AVS_CS_SAMPLE_BITS_8, // YUVA 4:2:2 8bit samples AVS_CS_YUVA420 = AVS_CS_GENERIC_YUVA420 | AVS_CS_SAMPLE_BITS_8, // YUVA 4:2:0 8bit samples AVS_CS_YUVA444P10 = AVS_CS_GENERIC_YUVA444 | AVS_CS_SAMPLE_BITS_10, // YUVA 4:4:4 10bit samples AVS_CS_YUVA422P10 = AVS_CS_GENERIC_YUVA422 | AVS_CS_SAMPLE_BITS_10, // YUVA 4:2:2 10bit samples AVS_CS_YUVA420P10 = AVS_CS_GENERIC_YUVA420 | AVS_CS_SAMPLE_BITS_10, // YUVA 4:2:0 10bit samples AVS_CS_YUVA444P12 = AVS_CS_GENERIC_YUVA444 | AVS_CS_SAMPLE_BITS_12, // YUVA 4:4:4 12bit samples AVS_CS_YUVA422P12 = AVS_CS_GENERIC_YUVA422 | AVS_CS_SAMPLE_BITS_12, // YUVA 4:2:2 12bit samples AVS_CS_YUVA420P12 = AVS_CS_GENERIC_YUVA420 | AVS_CS_SAMPLE_BITS_12, // YUVA 4:2:0 12bit samples AVS_CS_YUVA444P14 = AVS_CS_GENERIC_YUVA444 | AVS_CS_SAMPLE_BITS_14, // YUVA 4:4:4 14bit samples AVS_CS_YUVA422P14 = AVS_CS_GENERIC_YUVA422 | AVS_CS_SAMPLE_BITS_14, // YUVA 4:2:2 14bit samples AVS_CS_YUVA420P14 = AVS_CS_GENERIC_YUVA420 | AVS_CS_SAMPLE_BITS_14, // YUVA 4:2:0 14bit samples AVS_CS_YUVA444P16 = AVS_CS_GENERIC_YUVA444 | AVS_CS_SAMPLE_BITS_16, // YUVA 4:4:4 16bit samples AVS_CS_YUVA422P16 = AVS_CS_GENERIC_YUVA422 | AVS_CS_SAMPLE_BITS_16, // YUVA 4:2:2 16bit samples AVS_CS_YUVA420P16 = AVS_CS_GENERIC_YUVA420 | AVS_CS_SAMPLE_BITS_16, // YUVA 4:2:0 16bit samples AVS_CS_YUVA444PS = AVS_CS_GENERIC_YUVA444 | AVS_CS_SAMPLE_BITS_32, // YUVA 4:4:4 32bit samples AVS_CS_YUVA422PS = AVS_CS_GENERIC_YUVA422 | AVS_CS_SAMPLE_BITS_32, // YUVA 4:2:2 32bit samples AVS_CS_YUVA420PS = AVS_CS_GENERIC_YUVA420 | AVS_CS_SAMPLE_BITS_32, // YUVA 4:2:0 32bit samples }; enum { AVS_IT_BFF = 1<<0, AVS_IT_TFF = 1<<1, AVS_IT_FIELDBASED = 1<<2}; enum { AVS_FILTER_TYPE=1, AVS_FILTER_INPUT_COLORSPACE=2, AVS_FILTER_OUTPUT_TYPE=9, AVS_FILTER_NAME=4, AVS_FILTER_AUTHOR=5, AVS_FILTER_VERSION=6, AVS_FILTER_ARGS=7, AVS_FILTER_ARGS_INFO=8, AVS_FILTER_ARGS_DESCRIPTION=10, AVS_FILTER_DESCRIPTION=11}; enum { //SUBTYPES AVS_FILTER_TYPE_AUDIO=1, AVS_FILTER_TYPE_VIDEO=2, AVS_FILTER_OUTPUT_TYPE_SAME=3, AVS_FILTER_OUTPUT_TYPE_DIFFERENT=4}; enum { // New 2.6 explicitly defined cache hints. AVS_CACHE_NOTHING = 10, // Do not cache video. AVS_CACHE_WINDOW = 11, // Hard protect up to X frames within a range of X from the current frame N. AVS_CACHE_GENERIC = 12, // LRU cache up to X frames. AVS_CACHE_FORCE_GENERIC = 13, // LRU cache up to X frames, override any previous CACHE_WINDOW. AVS_CACHE_GET_POLICY = 30, // Get the current policy. AVS_CACHE_GET_WINDOW = 31, // Get the current window h_span. AVS_CACHE_GET_RANGE = 32, // Get the current generic frame range. AVS_CACHE_AUDIO = 50, // Explicitly do cache audio, X byte cache. AVS_CACHE_AUDIO_NOTHING = 51, // Explicitly do not cache audio. AVS_CACHE_AUDIO_NONE = 52, // Audio cache off (auto mode), X byte initial cache. AVS_CACHE_AUDIO_AUTO = 53, // Audio cache on (auto mode), X byte initial cache. AVS_CACHE_GET_AUDIO_POLICY = 70, // Get the current audio policy. AVS_CACHE_GET_AUDIO_SIZE = 71, // Get the current audio cache size. AVS_CACHE_PREFETCH_FRAME = 100, // Queue request to prefetch frame N. AVS_CACHE_PREFETCH_GO = 101, // Action video prefetches. AVS_CACHE_PREFETCH_AUDIO_BEGIN = 120, // Begin queue request transaction to prefetch audio (take critical section). AVS_CACHE_PREFETCH_AUDIO_STARTLO = 121, // Set low 32 bits of start. AVS_CACHE_PREFETCH_AUDIO_STARTHI = 122, // Set high 32 bits of start. AVS_CACHE_PREFETCH_AUDIO_COUNT = 123, // Set low 32 bits of length. 
AVS_CACHE_PREFETCH_AUDIO_COMMIT = 124, // Enqueue request transaction to prefetch audio (release critical section). AVS_CACHE_PREFETCH_AUDIO_GO = 125, // Action audio prefetches. AVS_CACHE_GETCHILD_CACHE_MODE = 200, // Cache ask Child for desired video cache mode. AVS_CACHE_GETCHILD_CACHE_SIZE = 201, // Cache ask Child for desired video cache size. AVS_CACHE_GETCHILD_AUDIO_MODE = 202, // Cache ask Child for desired audio cache mode. AVS_CACHE_GETCHILD_AUDIO_SIZE = 203, // Cache ask Child for desired audio cache size. AVS_CACHE_GETCHILD_COST = 220, // Cache ask Child for estimated processing cost. AVS_CACHE_COST_ZERO = 221, // Child response of zero cost (ptr arithmetic only). AVS_CACHE_COST_UNIT = 222, // Child response of unit cost (less than or equal 1 full frame blit). AVS_CACHE_COST_LOW = 223, // Child response of light cost. (Fast) AVS_CACHE_COST_MED = 224, // Child response of medium cost. (Real time) AVS_CACHE_COST_HI = 225, // Child response of heavy cost. (Slow) AVS_CACHE_GETCHILD_THREAD_MODE = 240, // Cache ask Child for thread safety. AVS_CACHE_THREAD_UNSAFE = 241, // Only 1 thread allowed for all instances. 2.5 filters default! AVS_CACHE_THREAD_CLASS = 242, // Only 1 thread allowed for each instance. 2.6 filters default! AVS_CACHE_THREAD_SAFE = 243, // Allow all threads in any instance. AVS_CACHE_THREAD_OWN = 244, // Safe but limit to 1 thread, internally threaded. AVS_CACHE_GETCHILD_ACCESS_COST = 260, // Cache ask Child for preferred access pattern. AVS_CACHE_ACCESS_RAND = 261, // Filter is access order agnostic. AVS_CACHE_ACCESS_SEQ0 = 262, // Filter prefers sequential access (low cost) AVS_CACHE_ACCESS_SEQ1 = 263, // Filter needs sequential access (high cost) AVS_CACHE_AVSPLUS_CONSTANTS = 500, // Smaller values are reserved for classic Avisynth AVS_CACHE_DONT_CACHE_ME = 501, // Filters that don't need caching (eg. trim, cache etc.) should return 1 to this request AVS_CACHE_SET_MIN_CAPACITY = 502, AVS_CACHE_SET_MAX_CAPACITY = 503, AVS_CACHE_GET_MIN_CAPACITY = 504, AVS_CACHE_GET_MAX_CAPACITY = 505, AVS_CACHE_GET_SIZE = 506, AVS_CACHE_GET_REQUESTED_CAP = 507, AVS_CACHE_GET_CAPACITY = 508, AVS_CACHE_GET_MTMODE = 509, AVS_CACHE_IS_CACHE_REQ = 510, AVS_CACHE_IS_CACHE_ANS = 511, AVS_CACHE_IS_MTGUARD_REQ = 512, AVS_CACHE_IS_MTGUARD_ANS = 513, AVS_CACHE_AVSPLUS_CUDA_CONSTANTS = 600, AVS_CACHE_GET_DEV_TYPE = 601, // Device types a filter can return AVS_CACHE_GET_CHILD_DEV_TYPE = 602, // Device types a fitler can receive AVS_CACHE_USER_CONSTANTS = 1000 // Smaller values are reserved for the core }; // enums for frame property functions // AVSPropTypes enum { AVS_PROPTYPE_UNSET = 'u', AVS_PROPTYPE_INT = 'i', AVS_PROPTYPE_FLOAT = 'f', AVS_PROPTYPE_DATA = 's', AVS_PROPTYPE_CLIP = 'c', AVS_PROPTYPE_FRAME = 'v' }; // AVSGetPropErrors for avs_prop_get_... enum { AVS_GETPROPERROR_UNSET = 1, AVS_GETPROPERROR_TYPE = 2, AVS_GETPROPERROR_INDEX = 4 }; // AVSPropAppendMode for avs_prop_set_... 
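// ---------------------------------------------------------------------------
// Illustrative sketch -- not part of the original Avisynth C API header.
// It shows how the AVS_CS_* bit-fields defined above compose a pixel_type
// value; the helper name example_bits_per_component is hypothetical, and real
// code should prefer the avs_bits_per_component() API declared further below.
AVSC_INLINE int example_bits_per_component(int pixel_type)
{
    switch (pixel_type & AVS_CS_SAMPLE_BITS_MASK)
    {
        case AVS_CS_SAMPLE_BITS_8:  return 8;
        case AVS_CS_SAMPLE_BITS_10: return 10;
        case AVS_CS_SAMPLE_BITS_12: return 12;
        case AVS_CS_SAMPLE_BITS_14: return 14;
        case AVS_CS_SAMPLE_BITS_16: return 16;
        case AVS_CS_SAMPLE_BITS_32: return 32; // 32-bit samples are float
        default:                    return 0;
    }
}
// For example, AVS_CS_YUV420P10 is AVS_CS_GENERIC_YUV420 | AVS_CS_SAMPLE_BITS_10,
// so example_bits_per_component(AVS_CS_YUV420P10) evaluates to 10.
// ---------------------------------------------------------------------------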
enum { AVS_PROPAPPENDMODE_REPLACE = 0, AVS_PROPAPPENDMODE_APPEND = 1, AVS_PROPAPPENDMODE_TOUCH = 2 }; // AvsEnvProperty for avs_get_env_property enum { AVS_AEP_PHYSICAL_CPUS = 1, AVS_AEP_LOGICAL_CPUS = 2, AVS_AEP_THREADPOOL_THREADS = 3, AVS_AEP_FILTERCHAIN_THREADS = 4, AVS_AEP_THREAD_ID = 5, AVS_AEP_VERSION = 6, AVS_AEP_HOST_SYSTEM_ENDIANNESS = 7, AVS_AEP_INTERFACE_VERSION = 8, AVS_AEP_INTERFACE_BUGFIX = 9, // Neo additionals AVS_AEP_NUM_DEVICES = 901, AVS_AEP_FRAME_ALIGN = 902, AVS_AEP_PLANE_ALIGN = 903, AVS_AEP_SUPPRESS_THREAD = 921, AVS_AEP_GETFRAME_RECURSIVE = 922 }; // enum AvsAllocType for avs_allocate enum { AVS_ALLOCTYPE_NORMAL_ALLOC = 1, AVS_ALLOCTYPE_POOLED_ALLOC = 2 }; #ifdef BUILDING_AVSCORE AVSValue create_c_video_filter(AVSValue args, void * user_data, IScriptEnvironment * e0); struct AVS_ScriptEnvironment { IScriptEnvironment * env; const char * error; AVS_ScriptEnvironment(IScriptEnvironment * e = 0) : env(e), error(0) {} }; #endif typedef struct AVS_Clip AVS_Clip; typedef struct AVS_ScriptEnvironment AVS_ScriptEnvironment; ///////////////////////////////////////////////////////////////////// // // AVS_VideoInfo // // AVS_VideoInfo is laid out identically to VideoInfo typedef struct AVS_VideoInfo { int width, height; // width=0 means no video unsigned fps_numerator, fps_denominator; int num_frames; int pixel_type; int audio_samples_per_second; // 0 means no audio int sample_type; int64_t num_audio_samples; int nchannels; // Image type properties int image_type; } AVS_VideoInfo; // useful functions of the above AVSC_INLINE int avs_has_video(const AVS_VideoInfo * p) { return (p->width!=0); } AVSC_INLINE int avs_has_audio(const AVS_VideoInfo * p) { return (p->audio_samples_per_second!=0); } AVSC_INLINE int avs_is_rgb(const AVS_VideoInfo * p) { return !!(p->pixel_type&AVS_CS_BGR); } AVSC_INLINE int avs_is_rgb24(const AVS_VideoInfo * p) { return ((p->pixel_type&AVS_CS_BGR24)==AVS_CS_BGR24) && ((p->pixel_type & AVS_CS_SAMPLE_BITS_MASK) == AVS_CS_SAMPLE_BITS_8); } AVSC_INLINE int avs_is_rgb32(const AVS_VideoInfo * p) { return ((p->pixel_type&AVS_CS_BGR32)==AVS_CS_BGR32) && ((p->pixel_type & AVS_CS_SAMPLE_BITS_MASK) == AVS_CS_SAMPLE_BITS_8); } AVSC_INLINE int avs_is_yuv(const AVS_VideoInfo * p) { return !!(p->pixel_type&AVS_CS_YUV ); } AVSC_INLINE int avs_is_yuy2(const AVS_VideoInfo * p) { return (p->pixel_type & AVS_CS_YUY2) == AVS_CS_YUY2; } AVSC_API(int, avs_is_yv24)(const AVS_VideoInfo * p); // avs+: for generic 444 check, use avs_is_yuv444 AVSC_API(int, avs_is_yv16)(const AVS_VideoInfo * p); // avs+: for generic 422 check, use avs_is_yuv422 AVSC_API(int, avs_is_yv12)(const AVS_VideoInfo * p) ; // avs+: for generic 420 check, use avs_is_yuv420 AVSC_API(int, avs_is_yv411)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_y8)(const AVS_VideoInfo * p); // avs+: for generic grayscale, use avs_is_y #ifdef AVSC_NO_DECLSPEC AVSC_INLINE int avs_is_yv24(const AVS_VideoInfo * p) { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV24 & AVS_CS_PLANAR_FILTER); } AVSC_INLINE int avs_is_yv16(const AVS_VideoInfo * p) { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV16 & AVS_CS_PLANAR_FILTER); } AVSC_INLINE int avs_is_yv12(const AVS_VideoInfo * p) { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV12 & AVS_CS_PLANAR_FILTER); } AVSC_INLINE int avs_is_yv411(const AVS_VideoInfo * p) { return (p->pixel_type & AVS_CS_PLANAR_MASK) == (AVS_CS_YV411 & AVS_CS_PLANAR_FILTER); } AVSC_INLINE int avs_is_y8(const AVS_VideoInfo * p) { return (p->pixel_type & AVS_CS_PLANAR_MASK) == 
(AVS_CS_Y8 & AVS_CS_PLANAR_FILTER); } #endif AVSC_API(int, avs_get_plane_width_subsampling)(const AVS_VideoInfo * p, int plane); AVSC_API(int, avs_get_plane_height_subsampling)(const AVS_VideoInfo * p, int plane); AVSC_API(int, avs_bits_per_pixel)(const AVS_VideoInfo * p); AVSC_API(int, avs_bytes_from_pixels)(const AVS_VideoInfo * p, int pixels); AVSC_API(int, avs_row_size)(const AVS_VideoInfo * p, int plane); AVSC_API(int, avs_bmp_size)(const AVS_VideoInfo * vi); AVSC_API(int, avs_is_color_space)(const AVS_VideoInfo * p, int c_space); // no API for these, inline helper functions AVSC_INLINE int avs_is_property(const AVS_VideoInfo * p, int property) { return ((p->image_type & property) == property); } AVSC_INLINE int avs_is_planar(const AVS_VideoInfo * p) { return !!(p->pixel_type & AVS_CS_PLANAR); } AVSC_INLINE int avs_is_field_based(const AVS_VideoInfo * p) { return !!(p->image_type & AVS_IT_FIELDBASED); } AVSC_INLINE int avs_is_parity_known(const AVS_VideoInfo * p) { return ((p->image_type & AVS_IT_FIELDBASED) && (p->image_type & (AVS_IT_BFF | AVS_IT_TFF))); } AVSC_INLINE int avs_is_bff(const AVS_VideoInfo * p) { return !!(p->image_type & AVS_IT_BFF); } AVSC_INLINE int avs_is_tff(const AVS_VideoInfo * p) { return !!(p->image_type & AVS_IT_TFF); } AVSC_INLINE int avs_samples_per_second(const AVS_VideoInfo * p) { return p->audio_samples_per_second; } AVSC_INLINE int avs_bytes_per_channel_sample(const AVS_VideoInfo * p) { switch (p->sample_type) { case AVS_SAMPLE_INT8: return sizeof(signed char); case AVS_SAMPLE_INT16: return sizeof(signed short); case AVS_SAMPLE_INT24: return 3; case AVS_SAMPLE_INT32: return sizeof(signed int); case AVS_SAMPLE_FLOAT: return sizeof(float); default: return 0; } } AVSC_INLINE int avs_bytes_per_audio_sample(const AVS_VideoInfo * p) { return p->nchannels*avs_bytes_per_channel_sample(p); } AVSC_INLINE int64_t avs_audio_samples_from_frames(const AVS_VideoInfo * p, int64_t frames) { return ((int64_t)(frames) * p->audio_samples_per_second * p->fps_denominator / p->fps_numerator); } AVSC_INLINE int avs_frames_from_audio_samples(const AVS_VideoInfo * p, int64_t samples) { return (int)(samples * (int64_t)p->fps_numerator / (int64_t)p->fps_denominator / (int64_t)p->audio_samples_per_second); } AVSC_INLINE int64_t avs_audio_samples_from_bytes(const AVS_VideoInfo * p, int64_t bytes) { return bytes / avs_bytes_per_audio_sample(p); } AVSC_INLINE int64_t avs_bytes_from_audio_samples(const AVS_VideoInfo * p, int64_t samples) { return samples * avs_bytes_per_audio_sample(p); } AVSC_INLINE int avs_audio_channels(const AVS_VideoInfo * p) { return p->nchannels; } AVSC_INLINE int avs_sample_type(const AVS_VideoInfo * p) { return p->sample_type; } // useful mutator // Note: these are video format properties, neither frame properties, nor system properties AVSC_INLINE void avs_set_property(AVS_VideoInfo * p, int property) { p->image_type|=property; } AVSC_INLINE void avs_clear_property(AVS_VideoInfo * p, int property) { p->image_type&=~property; } AVSC_INLINE void avs_set_field_based(AVS_VideoInfo * p, int isfieldbased) { if (isfieldbased) p->image_type|=AVS_IT_FIELDBASED; else p->image_type&=~AVS_IT_FIELDBASED; } AVSC_INLINE void avs_set_fps(AVS_VideoInfo * p, unsigned numerator, unsigned denominator) { unsigned x=numerator, y=denominator; while (y) { // find gcd unsigned t = x%y; x = y; y = t; } p->fps_numerator = numerator/x; p->fps_denominator = denominator/x; } AVSC_INLINE int avs_is_same_colorspace(const AVS_VideoInfo * x, const AVS_VideoInfo * y) { return (x->pixel_type == 
y->pixel_type) || (avs_is_yv12(x) && avs_is_yv12(y)); } // AviSynth+ extensions AVSC_API(int, avs_is_rgb48)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_rgb64)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_yuv444p16)(const AVS_VideoInfo * p); // deprecated, use avs_is_yuv444 AVSC_API(int, avs_is_yuv422p16)(const AVS_VideoInfo * p); // deprecated, use avs_is_yuv422 AVSC_API(int, avs_is_yuv420p16)(const AVS_VideoInfo * p); // deprecated, use avs_is_yuv420 AVSC_API(int, avs_is_y16)(const AVS_VideoInfo * p); // deprecated, use avs_is_y AVSC_API(int, avs_is_yuv444ps)(const AVS_VideoInfo * p); // deprecated, use avs_is_yuv444 AVSC_API(int, avs_is_yuv422ps)(const AVS_VideoInfo * p); // deprecated, use avs_is_yuv422 AVSC_API(int, avs_is_yuv420ps)(const AVS_VideoInfo * p); // deprecated, use avs_is_yuv420 AVSC_API(int, avs_is_y32)(const AVS_VideoInfo * p); // deprecated, use avs_is_y AVSC_API(int, avs_is_444)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_422)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_420)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_y)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_yuva)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_planar_rgb)(const AVS_VideoInfo * p); AVSC_API(int, avs_is_planar_rgba)(const AVS_VideoInfo * p); AVSC_API(int, avs_num_components)(const AVS_VideoInfo * p); AVSC_API(int, avs_component_size)(const AVS_VideoInfo * p); AVSC_API(int, avs_bits_per_component)(const AVS_VideoInfo * p); // end of Avisynth+ specific ///////////////////////////////////////////////////////////////////// // // AVS_VideoFrame // // VideoFrameBuffer holds information about a memory block which is used // for video data. For efficiency, instances of this class are not deleted // when the refcount reaches zero; instead they're stored in a linked list // to be reused. The instances are deleted when the corresponding AVS // file is closed. // AVS_VideoFrameBuffer is laid out identically to VideoFrameBuffer // DO NOT USE THIS STRUCTURE DIRECTLY typedef struct AVS_VideoFrameBuffer { BYTE * data; int data_size; // sequence_number is incremented every time the buffer is changed, so // that stale views can tell they're no longer valid. volatile long sequence_number; volatile long refcount; void* device; // avs+ } AVS_VideoFrameBuffer; // VideoFrame holds a "window" into a VideoFrameBuffer. // AVS_VideoFrame is laid out identically to IVideoFrame // DO NOT USE THIS STRUCTURE DIRECTLY typedef struct AVS_VideoFrame { volatile long refcount; AVS_VideoFrameBuffer * vfb; int offset; int pitch, row_size, height; int offsetU, offsetV; int pitchUV; // U&V offsets are from top of picture. int row_sizeUV, heightUV; // for Planar RGB offsetU, offsetV is for the 2nd and 3rd Plane. 
// for Planar RGB pitchUV and row_sizeUV = 0, because when no VideoInfo (MakeWriteable) // the decision on existence of UV is checked by zero pitch // AVS+ extension, avisynth.h: class does not break plugins if appended here int offsetA; int pitchA, row_sizeA; // 4th alpha plane support, pitch and row_size is 0 is none void* properties; // frame properties } AVS_VideoFrame; // Access functions for AVS_VideoFrame AVSC_API(int, avs_get_pitch_p)(const AVS_VideoFrame * p, int plane); AVSC_API(int, avs_get_row_size_p)(const AVS_VideoFrame * p, int plane); AVSC_API(int, avs_get_height_p)(const AVS_VideoFrame * p, int plane); AVSC_API(const BYTE *, avs_get_read_ptr_p)(const AVS_VideoFrame * p, int plane); #ifdef AVSC_NO_DECLSPEC AVSC_INLINE int avs_get_pitch_p(const AVS_VideoFrame * p, int plane) { switch (plane) { case AVS_PLANAR_U: case AVS_PLANAR_V: return p->pitchUV; case AVS_PLANAR_A: return p->pitchA; } return p->pitch; // Y, G, B, R } AVSC_INLINE int avs_get_row_size_p(const AVS_VideoFrame * p, int plane) { switch (plane) { case AVS_PLANAR_U: case AVS_PLANAR_V: return (p->pitchUV) ? p->row_sizeUV : 0; case AVS_PLANAR_A: return (p->pitchA) ? p->row_sizeA : 0; } return p->row_size; } AVSC_INLINE int avs_get_height_p(const AVS_VideoFrame * p, int plane) { switch (plane) { case AVS_PLANAR_U: case AVS_PLANAR_V: return (p->pitchUV) ? p->heightUV : 0; case AVS_PLANAR_A: return (p->pitchA) ? p->height : 0; } return p->height; // Y, G, B, R, A } AVSC_INLINE const BYTE * avs_get_read_ptr_p(const AVS_VideoFrame * p, int plane) { switch (plane) { // G is first. Then B,R order like U,V case AVS_PLANAR_U: case AVS_PLANAR_B: return p->vfb->data + p->offsetU; case AVS_PLANAR_V: case AVS_PLANAR_R: return p->vfb->data + p->offsetV; case AVS_PLANAR_A: return p->vfb->data + p->offsetA; } return p->vfb->data + p->offset; // Y, G } #endif AVSC_API(int, avs_is_writable)(const AVS_VideoFrame * p); // V9 AVSC_API(int, avs_is_property_writable)(const AVS_VideoFrame* p); AVSC_API(BYTE *, avs_get_write_ptr_p)(const AVS_VideoFrame * p, int plane); AVSC_API(void, avs_release_video_frame)(AVS_VideoFrame *); // makes a shallow copy of a video frame AVSC_API(AVS_VideoFrame *, avs_copy_video_frame)(AVS_VideoFrame *); // no API for these, inline helper functions AVSC_INLINE int avs_get_pitch(const AVS_VideoFrame * p) {return avs_get_pitch_p(p, 0);} AVSC_INLINE int avs_get_row_size(const AVS_VideoFrame * p) {return avs_get_row_size_p(p, 0);} AVSC_INLINE int avs_get_height(const AVS_VideoFrame * p) {return avs_get_height_p(p, 0);} AVSC_INLINE const BYTE* avs_get_read_ptr(const AVS_VideoFrame * p) {return avs_get_read_ptr_p(p, 0);} #ifndef AVSC_NO_DECLSPEC // this inline function is calling an API function AVSC_INLINE BYTE* avs_get_write_ptr(const AVS_VideoFrame * p) {return avs_get_write_ptr_p(p, 0);} #endif #ifndef AVSC_NO_DECLSPEC // this inline function is calling an API function AVSC_INLINE void avs_release_frame(AVS_VideoFrame * f) {avs_release_video_frame(f);} #endif #ifndef AVSC_NO_DECLSPEC // this inline function is calling an API function AVSC_INLINE AVS_VideoFrame * avs_copy_frame(AVS_VideoFrame * f) {return avs_copy_video_frame(f);} #endif // Interface V8: frame properties // AVS_Map is just a placeholder for AVSMap typedef struct AVS_Map { void* data; } AVS_Map; ///////////////////////////////////////////////////////////////////// // // AVS_Value // // Treat AVS_Value as a fat pointer. That is use avs_copy_value // and avs_release_value appropriately as you would if AVS_Value was // a pointer. 
// To maintain source code compatibility with future versions of the // avisynth_c API don't use the AVS_Value directly. Use the helper // functions below. // AVS_Value is laid out identically to AVSValue typedef struct AVS_Value AVS_Value; struct AVS_Value { short type; // 'a'rray, 'c'lip, 'b'ool, 'i'nt, 'f'loat, 's'tring, 'v'oid, or 'l'ong, or fu'n'ction // for some function e'rror short array_size; union { void * clip; // do not use directly, use avs_take_clip char boolean; int integer; float floating_pt; const char * string; const AVS_Value * array; void * function; // not supported on C interface #ifdef AVS_X86_64 // if ever, only x64 will support. It breaks struct size on 32 bit int64_t longlong; // 8 bytes double double_pt; // 8 bytes #endif } d; }; // AVS_Value should be initialized with avs_void. // Should also set to avs_void after the value is released // with avs_copy_value. Consider it the equivalent of setting // a pointer to NULL static const AVS_Value avs_void = {'v'}; AVSC_API(void, avs_copy_value)(AVS_Value * dest, AVS_Value src); AVSC_API(void, avs_release_value)(AVS_Value); AVSC_API(AVS_Clip *, avs_take_clip)(AVS_Value, AVS_ScriptEnvironment *); AVSC_API(void, avs_set_to_clip)(AVS_Value *, AVS_Clip *); // no API for these, inline helper functions AVSC_INLINE int avs_defined(AVS_Value v) { return v.type != 'v'; } AVSC_INLINE int avs_is_clip(AVS_Value v) { return v.type == 'c'; } AVSC_INLINE int avs_is_bool(AVS_Value v) { return v.type == 'b'; } AVSC_INLINE int avs_is_int(AVS_Value v) { return v.type == 'i'; } AVSC_INLINE int avs_is_float(AVS_Value v) { return v.type == 'f' || v.type == 'i'; } AVSC_INLINE int avs_is_string(AVS_Value v) { return v.type == 's'; } AVSC_INLINE int avs_is_array(AVS_Value v) { return v.type == 'a'; } AVSC_INLINE int avs_is_error(AVS_Value v) { return v.type == 'e'; } AVSC_INLINE int avs_as_bool(AVS_Value v) { return v.d.boolean; } AVSC_INLINE int avs_as_int(AVS_Value v) { return v.d.integer; } AVSC_INLINE const char * avs_as_string(AVS_Value v) { return avs_is_error(v) || avs_is_string(v) ? v.d.string : 0; } AVSC_INLINE double avs_as_float(AVS_Value v) { return avs_is_int(v) ? v.d.integer : v.d.floating_pt; } AVSC_INLINE const char * avs_as_error(AVS_Value v) { return avs_is_error(v) ? v.d.string : 0; } AVSC_INLINE const AVS_Value * avs_as_array(AVS_Value v) { return v.d.array; } AVSC_INLINE int avs_array_size(AVS_Value v) { return avs_is_array(v) ? v.array_size : 1; } AVSC_INLINE AVS_Value avs_array_elt(AVS_Value v, int index) { return avs_is_array(v) ? v.d.array[index] : v; } // only use these functions on an AVS_Value that does not already have // an active value. Remember, treat AVS_Value as a fat pointer. AVSC_INLINE AVS_Value avs_new_value_bool(int v0) { AVS_Value v; v.type = 'b'; v.d.boolean = v0 == 0 ? 
0 : 1; return v; } AVSC_INLINE AVS_Value avs_new_value_int(int v0) { AVS_Value v; v.type = 'i'; v.d.integer = v0; return v; } AVSC_INLINE AVS_Value avs_new_value_string(const char * v0) { AVS_Value v; v.type = 's'; v.d.string = v0; return v; } AVSC_INLINE AVS_Value avs_new_value_float(float v0) { AVS_Value v; v.type = 'f'; v.d.floating_pt = v0; return v; } AVSC_INLINE AVS_Value avs_new_value_error(const char * v0) { AVS_Value v; v.type = 'e'; v.d.string = v0; return v; } #ifndef AVSC_NO_DECLSPEC // this inline function is calling an API function AVSC_INLINE AVS_Value avs_new_value_clip(AVS_Clip * v0) { AVS_Value v; avs_set_to_clip(&v, v0); return v; } #endif AVSC_INLINE AVS_Value avs_new_value_array(AVS_Value * v0, int size) { AVS_Value v; v.type = 'a'; v.d.array = v0; v.array_size = (short)size; return v; } // end of inline helper functions ///////////////////////////////////////////////////////////////////// // // AVS_Clip // AVSC_API(void, avs_release_clip)(AVS_Clip *); AVSC_API(AVS_Clip *, avs_copy_clip)(AVS_Clip *); AVSC_API(const char *, avs_clip_get_error)(AVS_Clip *); // return 0 if no error AVSC_API(const AVS_VideoInfo *, avs_get_video_info)(AVS_Clip *); AVSC_API(int, avs_get_version)(AVS_Clip *); AVSC_API(AVS_VideoFrame *, avs_get_frame)(AVS_Clip *, int n); // The returned video frame must be released with avs_release_video_frame AVSC_API(int, avs_get_parity)(AVS_Clip *, int n); // return field parity if field_based, else parity of first field in frame AVSC_API(int, avs_get_audio)(AVS_Clip *, void * buf, int64_t start, int64_t count); // start and count are in samples AVSC_API(int, avs_set_cache_hints)(AVS_Clip *, int cachehints, int frame_range); // This is the callback type used by avs_add_function typedef AVS_Value (AVSC_CC * AVS_ApplyFunc) (AVS_ScriptEnvironment *, AVS_Value args, void * user_data); typedef struct AVS_FilterInfo AVS_FilterInfo; struct AVS_FilterInfo { // these members should not be modified outside of the AVS_ApplyFunc callback AVS_Clip * child; AVS_VideoInfo vi; AVS_ScriptEnvironment * env; AVS_VideoFrame * (AVSC_CC * get_frame)(AVS_FilterInfo *, int n); int (AVSC_CC * get_parity)(AVS_FilterInfo *, int n); int (AVSC_CC * get_audio)(AVS_FilterInfo *, void * buf, int64_t start, int64_t count); int (AVSC_CC * set_cache_hints)(AVS_FilterInfo *, int cachehints, int frame_range); void (AVSC_CC * free_filter)(AVS_FilterInfo *); // Should be set when ever there is an error to report. // It is cleared before any of the above methods are called const char * error; // this is to store whatever and may be modified at will void * user_data; }; // Create a new filter // fi is set to point to the AVS_FilterInfo so that you can // modify it once it is initialized. // store_child should generally be set to true. If it is not // set than ALL methods (the function pointers) must be defined // If it is set than you do not need to worry about freeing the child // clip. AVSC_API(AVS_Clip *, avs_new_c_filter)(AVS_ScriptEnvironment * e, AVS_FilterInfo * * fi, AVS_Value child, int store_child); ///////////////////////////////////////////////////////////////////// // // AVS_ScriptEnvironment // // For GetCPUFlags. These are backwards-compatible with those in VirtualDub. 
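// ---------------------------------------------------------------------------
// Illustrative sketch -- not part of the original Avisynth C API header.
// It shows the usual shape of a filter built on AVS_FilterInfo and
// avs_new_c_filter() as described in the comment above.  All identifiers
// ("ExampleFilter", example_get_frame, create_example_filter) are
// hypothetical and error handling is kept minimal; the creator function would
// be registered from avisynth_c_plugin_init() with avs_add_function(), which
// is declared further below.
static AVS_VideoFrame * AVSC_CC example_get_frame(AVS_FilterInfo * fi, int n)
{
    // Fetch the source frame from the child clip; a real filter would
    // typically copy or modify the plane data here before returning it.
    AVS_VideoFrame * frame = avs_get_frame(fi->child, n);
    if (!frame)
        fi->error = "ExampleFilter: child clip returned no frame";
    return frame; // released by the caller
}

static AVS_Value AVSC_CC create_example_filter(AVS_ScriptEnvironment * env, AVS_Value args, void * user_data)
{
    AVS_FilterInfo * fi;
    // args element 0 is the child clip (parameter string "c").
    AVS_Clip * clip = avs_new_c_filter(env, &fi, avs_array_elt(args, 0), 1);
    AVS_Value result;
    (void)user_data;
    // With store_child set, the remaining callbacks need not all be provided
    // (see the comment above avs_new_c_filter).
    fi->get_frame = example_get_frame;
    result = avs_new_value_clip(clip);
    avs_release_clip(clip); // the AVS_Value is assumed to hold its own reference
    return result;
}
// ---------------------------------------------------------------------------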
enum { /* slowest CPU to support extension */ AVS_CPU_FORCE = 0x01, // N/A AVS_CPU_FPU = 0x02, // 386/486DX AVS_CPU_MMX = 0x04, // P55C, K6, PII AVS_CPU_INTEGER_SSE = 0x08, // PIII, Athlon AVS_CPU_SSE = 0x10, // PIII, Athlon XP/MP AVS_CPU_SSE2 = 0x20, // PIV, Hammer AVS_CPU_3DNOW = 0x40, // K6-2 AVS_CPU_3DNOW_EXT = 0x80, // Athlon AVS_CPU_X86_64 = 0xA0, // Hammer (note: equiv. to 3DNow + SSE2, // which only Hammer will have anyway) AVS_CPUF_SSE3 = 0x100, // PIV+, K8 Venice AVS_CPUF_SSSE3 = 0x200, // Core 2 AVS_CPUF_SSE4 = 0x400, // Penryn, Wolfdale, Yorkfield AVS_CPUF_SSE4_1 = 0x400, AVS_CPUF_AVX = 0x800, // Sandy Bridge, Bulldozer AVS_CPUF_SSE4_2 = 0x1000, // Nehalem // AVS+ AVS_CPUF_AVX2 = 0x2000, // Haswell AVS_CPUF_FMA3 = 0x4000, AVS_CPUF_F16C = 0x8000, AVS_CPUF_MOVBE = 0x10000, // Big Endian Move AVS_CPUF_POPCNT = 0x20000, AVS_CPUF_AES = 0x40000, AVS_CPUF_FMA4 = 0x80000, AVS_CPUF_AVX512F = 0x100000, // AVX-512 Foundation. AVS_CPUF_AVX512DQ = 0x200000, // AVX-512 DQ (Double/Quad granular) Instructions AVS_CPUF_AVX512PF = 0x400000, // AVX-512 Prefetch AVS_CPUF_AVX512ER = 0x800000, // AVX-512 Exponential and Reciprocal AVS_CPUF_AVX512CD = 0x1000000, // AVX-512 Conflict Detection AVS_CPUF_AVX512BW = 0x2000000, // AVX-512 BW (Byte/Word granular) Instructions AVS_CPUF_AVX512VL = 0x4000000, // AVX-512 VL (128/256 Vector Length) Extensions AVS_CPUF_AVX512IFMA = 0x8000000, // AVX-512 IFMA integer 52 bit AVS_CPUF_AVX512VBMI = 0x10000000 // AVX-512 VBMI }; AVSC_API(const char *, avs_get_error)(AVS_ScriptEnvironment *); // return 0 if no error AVSC_API(int, avs_get_cpu_flags)(AVS_ScriptEnvironment *); AVSC_API(int, avs_check_version)(AVS_ScriptEnvironment *, int version); AVSC_API(char *, avs_save_string)(AVS_ScriptEnvironment *, const char* s, int length); AVSC_API(char *, avs_sprintf)(AVS_ScriptEnvironment *, const char * fmt, ...); AVSC_API(char *, avs_vsprintf)(AVS_ScriptEnvironment *, const char * fmt, va_list val); AVSC_API(int, avs_add_function)(AVS_ScriptEnvironment *, const char * name, const char * params, AVS_ApplyFunc apply, void * user_data); AVSC_API(int, avs_function_exists)(AVS_ScriptEnvironment *, const char * name); AVSC_API(AVS_Value, avs_invoke)(AVS_ScriptEnvironment *, const char * name, AVS_Value args, const char** arg_names); // The returned value must be be released with avs_release_value AVSC_API(AVS_Value, avs_get_var)(AVS_ScriptEnvironment *, const char* name); // The returned value must be be released with avs_release_value AVSC_API(int, avs_set_var)(AVS_ScriptEnvironment *, const char* name, AVS_Value val); AVSC_API(int, avs_set_global_var)(AVS_ScriptEnvironment *, const char* name, const AVS_Value val); //void avs_push_context(AVS_ScriptEnvironment *, int level=0); //void avs_pop_context(AVS_ScriptEnvironment *); // partially deprecated, from V8 use avs_new_video_frame_p_a (frame property copy) AVSC_API(AVS_VideoFrame *, avs_new_video_frame_a)(AVS_ScriptEnvironment *, const AVS_VideoInfo * vi, int align); // align should be at least 16 for classic Avisynth // Avisynth+: any value, Avs+ ensures a minimum alignment if too small align is provided // no API for these, inline helper functions #ifndef AVSC_NO_DECLSPEC // partially deprecated, from V8 use avs_new_video_frame_p (frame property copy) // this inline function is calling an API function AVSC_INLINE AVS_VideoFrame * avs_new_video_frame(AVS_ScriptEnvironment * env, const AVS_VideoInfo * vi) {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);} // an older compatibility alias // this inline function is calling an 
API function AVSC_INLINE AVS_VideoFrame * avs_new_frame(AVS_ScriptEnvironment * env, const AVS_VideoInfo * vi) {return avs_new_video_frame_a(env,vi,AVS_FRAME_ALIGN);} #endif // end of inline helper functions AVSC_API(int, avs_make_writable)(AVS_ScriptEnvironment *, AVS_VideoFrame * * pvf); // V9 AVSC_API(int, avs_make_property_writable)(AVS_ScriptEnvironment*, AVS_VideoFrame** pvf); AVSC_API(void, avs_bit_blt)(AVS_ScriptEnvironment *, BYTE* dstp, int dst_pitch, const BYTE* srcp, int src_pitch, int row_size, int height); typedef void (AVSC_CC *AVS_ShutdownFunc)(void* user_data, AVS_ScriptEnvironment * env); AVSC_API(void, avs_at_exit)(AVS_ScriptEnvironment *, AVS_ShutdownFunc function, void * user_data); AVSC_API(AVS_VideoFrame *, avs_subframe)(AVS_ScriptEnvironment *, AVS_VideoFrame * src, int rel_offset, int new_pitch, int new_row_size, int new_height); // The returned video frame must be be released AVSC_API(AVS_VideoFrame*, avs_subframe_planar)(AVS_ScriptEnvironment*, AVS_VideoFrame* src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV); // The returned video frame must be be released // see also avs_subframe_planar_a in interface V8 AVSC_API(int, avs_set_memory_max)(AVS_ScriptEnvironment *, int mem); AVSC_API(int, avs_set_working_dir)(AVS_ScriptEnvironment *, const char * newdir); // avisynth.dll exports this; it's a way to use it as a library, without // writing an AVS script or without going through AVIFile. AVSC_API(AVS_ScriptEnvironment *, avs_create_script_environment)(int version); // this symbol is the entry point for the plugin and must // be defined AVSC_EXPORT const char * AVSC_CC avisynth_c_plugin_init(AVS_ScriptEnvironment* env); AVSC_API(void, avs_delete_script_environment)(AVS_ScriptEnvironment *); /////////////////////////////////////////////////////////////////////////////// // // Avisynth+ V8 interface elements // AVSC_API(AVS_VideoFrame*, avs_subframe_planar_a)(AVS_ScriptEnvironment*, AVS_VideoFrame* src, int rel_offset, int new_pitch, int new_row_size, int new_height, int rel_offsetU, int rel_offsetV, int new_pitchUV, int rel_offsetA); // The returned video frame must be be released AVSC_API(void, avs_copy_frame_props)(AVS_ScriptEnvironment* p, const AVS_VideoFrame* src, AVS_VideoFrame* dst); AVSC_API(const AVS_Map*, avs_get_frame_props_ro)(AVS_ScriptEnvironment* p, const AVS_VideoFrame* frame); AVSC_API(AVS_Map*, avs_get_frame_props_rw)(AVS_ScriptEnvironment* p, AVS_VideoFrame* frame); AVSC_API(int, avs_prop_num_keys)(AVS_ScriptEnvironment* p, const AVS_Map* map); AVSC_API(const char*, avs_prop_get_key)(AVS_ScriptEnvironment* p, const AVS_Map* map, int index); AVSC_API(int, avs_prop_num_elements)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key); // see AVS_PROPTYPE_... enums AVSC_API(char, avs_prop_get_type)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key); // see AVS_GETPROPERROR_... 
enums AVSC_API(int64_t, avs_prop_get_int)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int index, int* error); AVSC_API(double, avs_prop_get_float)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int index, int* error); AVSC_API(const char*, avs_prop_get_data)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int index, int* error); AVSC_API(int, avs_prop_get_data_size)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int index, int* error); AVSC_API(AVS_Clip*, avs_prop_get_clip)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int index, int* error); AVSC_API(const AVS_VideoFrame*, avs_prop_get_frame)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int index, int* error); AVSC_API(int, avs_prop_delete_key)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key); // see AVS_PROPAPPENDMODE_... enums AVSC_API(int, avs_prop_set_int)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, int64_t i, int append); AVSC_API(int, avs_prop_set_float)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, double d, int append); AVSC_API(int, avs_prop_set_data)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, const char* d, int length, int append); AVSC_API(int, avs_prop_set_clip)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, AVS_Clip* clip, int append); AVSC_API(int, avs_prop_set_frame)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, const AVS_VideoFrame* frame, int append); AVSC_API(const int64_t*, avs_prop_get_int_array)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int* error); AVSC_API(const double*, avs_prop_get_float_array)(AVS_ScriptEnvironment* p, const AVS_Map* map, const char* key, int* error); AVSC_API(int, avs_prop_set_int_array)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, const int64_t* i, int size); AVSC_API(int, avs_prop_set_float_array)(AVS_ScriptEnvironment* p, AVS_Map* map, const char* key, const double* d, int size); AVSC_API(void, avs_clear_map)(AVS_ScriptEnvironment* p, AVS_Map* map); // with frame property source AVSC_API(AVS_VideoFrame*, avs_new_video_frame_p)(AVS_ScriptEnvironment*, const AVS_VideoInfo* vi, AVS_VideoFrame* propSrc); // with frame property source AVSC_API(AVS_VideoFrame*, avs_new_video_frame_p_a)(AVS_ScriptEnvironment*, const AVS_VideoInfo* vi, AVS_VideoFrame* propSrc, int align); // Generic query to ask for various system properties, see AVS_AEP_xxx enums AVSC_API(size_t, avs_get_env_property)(AVS_ScriptEnvironment*, int avs_aep_prop); // buffer pool, see AVS_ALLOCTYPE enums AVSC_API(void *, avs_pool_allocate)(AVS_ScriptEnvironment*, size_t nBytes, size_t alignment, int avs_alloc_type); AVSC_API(void, avs_pool_free)(AVS_ScriptEnvironment*, void *ptr); // Interface V8 // Returns TRUE (1) and the requested variable. If the method fails, returns 0 (FALSE) and does not touch 'val'. // The returned AVS_Value *val value must be be released with avs_release_value only on success // AVS_Value *val is not caller allocated AVSC_API(int, avs_get_var_try)(AVS_ScriptEnvironment*, const char* name, AVS_Value* val); // Interface V8 // Return the value of the requested variable. // If the variable was not found or had the wrong type, // return the supplied default value. 
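// ---------------------------------------------------------------------------
// Illustrative sketch -- not part of the original Avisynth C API header.
// Basic use of the interface V8 frame-property calls declared above; the
// property name "_ExampleCounter" and both helper names are hypothetical.
static void example_write_frame_prop(AVS_ScriptEnvironment * env, AVS_VideoFrame * frame, int64_t value)
{
    // Writable view of the frame's property map; cf. avs_is_property_writable /
    // avs_make_property_writable (V9) declared earlier.
    AVS_Map * props = avs_get_frame_props_rw(env, frame);
    avs_prop_set_int(env, props, "_ExampleCounter", value, AVS_PROPAPPENDMODE_REPLACE);
}

static int64_t example_read_frame_prop(AVS_ScriptEnvironment * env, const AVS_VideoFrame * frame, int64_t fallback)
{
    int error = 0;
    const AVS_Map * props = avs_get_frame_props_ro(env, frame);
    int64_t value = avs_prop_get_int(env, props, "_ExampleCounter", 0, &error);
    // `error` carries the AVS_GETPROPERROR_* flags defined earlier
    // (missing key, wrong type, index out of range).
    return error ? fallback : value;
}
// ---------------------------------------------------------------------------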
AVSC_API(int, avs_get_var_bool)(AVS_ScriptEnvironment*, const char* name, int def); AVSC_API(int, avs_get_var_int)(AVS_ScriptEnvironment*, const char* name, int def); AVSC_API(double, avs_get_var_double)(AVS_ScriptEnvironment*, const char* name, double def); AVSC_API(const char*, avs_get_var_string)(AVS_ScriptEnvironment*, const char* name, const char* def); AVSC_API(int64_t, avs_get_var_long)(AVS_ScriptEnvironment*, const char* name, int64_t def); #if defined(AVS_WINDOWS) // The following stuff is only relevant for Windows DLL handling; Linux does it completely differently. #ifdef AVSC_NO_DECLSPEC // This part uses LoadLibrary and related functions to dynamically load Avisynth instead of declspec(dllimport) // When AVSC_NO_DECLSPEC is defined, you can use avs_load_library to populate API functions into a struct // AVSC_INLINE functions which call onto an API functions should be treated specially (todo) /* The following functions needs to have been declared, probably from windows.h void* malloc(size_t) void free(void*); HMODULE LoadLibraryA(const char*); void* GetProcAddress(HMODULE, const char*); FreeLibrary(HMODULE); */ typedef struct AVS_Library AVS_Library; #define AVSC_DECLARE_FUNC(name) name##_func name // AVSC_DECLARE_FUNC helps keeping naming convention: type is xxxxx_func, function name is xxxxx // e.g. "AVSC_DECLARE_FUNC(avs_add_function);" // is a shortcut for "avs_add_function_func avs_add_function;" // Note: AVSC_INLINE functions which call into API, // are guarded by #ifndef AVSC_NO_DECLSPEC // They should call the appropriate library-> API entry struct AVS_Library { HMODULE handle; AVSC_DECLARE_FUNC(avs_add_function); AVSC_DECLARE_FUNC(avs_at_exit); AVSC_DECLARE_FUNC(avs_bit_blt); AVSC_DECLARE_FUNC(avs_check_version); AVSC_DECLARE_FUNC(avs_clip_get_error); AVSC_DECLARE_FUNC(avs_copy_clip); AVSC_DECLARE_FUNC(avs_copy_value); AVSC_DECLARE_FUNC(avs_copy_video_frame); AVSC_DECLARE_FUNC(avs_create_script_environment); AVSC_DECLARE_FUNC(avs_delete_script_environment); AVSC_DECLARE_FUNC(avs_function_exists); AVSC_DECLARE_FUNC(avs_get_audio); AVSC_DECLARE_FUNC(avs_get_cpu_flags); AVSC_DECLARE_FUNC(avs_get_frame); AVSC_DECLARE_FUNC(avs_get_parity); AVSC_DECLARE_FUNC(avs_get_var); AVSC_DECLARE_FUNC(avs_get_version); AVSC_DECLARE_FUNC(avs_get_video_info); AVSC_DECLARE_FUNC(avs_invoke); AVSC_DECLARE_FUNC(avs_make_writable); AVSC_DECLARE_FUNC(avs_new_c_filter); AVSC_DECLARE_FUNC(avs_new_video_frame_a); AVSC_DECLARE_FUNC(avs_release_clip); AVSC_DECLARE_FUNC(avs_release_value); AVSC_DECLARE_FUNC(avs_release_video_frame); AVSC_DECLARE_FUNC(avs_save_string); AVSC_DECLARE_FUNC(avs_set_cache_hints); AVSC_DECLARE_FUNC(avs_set_global_var); AVSC_DECLARE_FUNC(avs_set_memory_max); AVSC_DECLARE_FUNC(avs_set_to_clip); AVSC_DECLARE_FUNC(avs_set_var); AVSC_DECLARE_FUNC(avs_set_working_dir); AVSC_DECLARE_FUNC(avs_sprintf); AVSC_DECLARE_FUNC(avs_subframe); AVSC_DECLARE_FUNC(avs_subframe_planar); AVSC_DECLARE_FUNC(avs_take_clip); AVSC_DECLARE_FUNC(avs_vsprintf); AVSC_DECLARE_FUNC(avs_get_error); AVSC_DECLARE_FUNC(avs_is_yv24); AVSC_DECLARE_FUNC(avs_is_yv16); AVSC_DECLARE_FUNC(avs_is_yv12); AVSC_DECLARE_FUNC(avs_is_yv411); AVSC_DECLARE_FUNC(avs_is_y8); AVSC_DECLARE_FUNC(avs_is_color_space); AVSC_DECLARE_FUNC(avs_get_plane_width_subsampling); AVSC_DECLARE_FUNC(avs_get_plane_height_subsampling); AVSC_DECLARE_FUNC(avs_bits_per_pixel); AVSC_DECLARE_FUNC(avs_bytes_from_pixels); AVSC_DECLARE_FUNC(avs_row_size); AVSC_DECLARE_FUNC(avs_bmp_size); AVSC_DECLARE_FUNC(avs_get_pitch_p); 
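// Illustrative sketch -- not part of the original Avisynth C API header.
// When the client is built with AVSC_NO_DECLSPEC, the API is reached through
// this struct rather than through import-library symbols, e.g. (variable
// names hypothetical):
//   AVS_Library * avs = avs_load_library();   // defined further below
//   AVS_ScriptEnvironment * env =
//       avs->avs_create_script_environment(AVISYNTH_INTERFACE_CLASSIC_VERSION);
//   /* ... use env via avs->avs_invoke(), avs->avs_get_frame(), ... */
//   avs->avs_delete_script_environment(env);
//   avs_free_library(avs);
// This is the call pattern the note above refers to: the AVSC_INLINE wrappers
// that would normally call the API directly are compiled out, so every call
// goes through a library-> entry.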
AVSC_DECLARE_FUNC(avs_get_row_size_p); AVSC_DECLARE_FUNC(avs_get_height_p); AVSC_DECLARE_FUNC(avs_get_read_ptr_p); AVSC_DECLARE_FUNC(avs_is_writable); AVSC_DECLARE_FUNC(avs_get_write_ptr_p); // Avisynth+ specific // Note: these functions are simulated/use fallback to existing functions AVSC_DECLARE_FUNC(avs_is_rgb48); AVSC_DECLARE_FUNC(avs_is_rgb64); AVSC_DECLARE_FUNC(avs_is_yuv444p16); AVSC_DECLARE_FUNC(avs_is_yuv422p16); AVSC_DECLARE_FUNC(avs_is_yuv420p16); AVSC_DECLARE_FUNC(avs_is_y16); AVSC_DECLARE_FUNC(avs_is_yuv444ps); AVSC_DECLARE_FUNC(avs_is_yuv422ps); AVSC_DECLARE_FUNC(avs_is_yuv420ps); AVSC_DECLARE_FUNC(avs_is_y32); AVSC_DECLARE_FUNC(avs_is_444); AVSC_DECLARE_FUNC(avs_is_422); AVSC_DECLARE_FUNC(avs_is_420); AVSC_DECLARE_FUNC(avs_is_y); AVSC_DECLARE_FUNC(avs_is_yuva); AVSC_DECLARE_FUNC(avs_is_planar_rgb); AVSC_DECLARE_FUNC(avs_is_planar_rgba); AVSC_DECLARE_FUNC(avs_num_components); AVSC_DECLARE_FUNC(avs_component_size); AVSC_DECLARE_FUNC(avs_bits_per_component); /////////////////////////////////////////////////////////////////////////////// // Avisynth+ new interface elements from interface version 8 // avs_subframe_planar with alpha support AVSC_DECLARE_FUNC(avs_subframe_planar_a); // frame properties AVSC_DECLARE_FUNC(avs_copy_frame_props); AVSC_DECLARE_FUNC(avs_get_frame_props_ro); AVSC_DECLARE_FUNC(avs_get_frame_props_rw); AVSC_DECLARE_FUNC(avs_prop_num_keys); AVSC_DECLARE_FUNC(avs_prop_get_key); AVSC_DECLARE_FUNC(avs_prop_num_elements); AVSC_DECLARE_FUNC(avs_prop_get_type); AVSC_DECLARE_FUNC(avs_prop_get_int); AVSC_DECLARE_FUNC(avs_prop_get_float); AVSC_DECLARE_FUNC(avs_prop_get_data); AVSC_DECLARE_FUNC(avs_prop_get_data_size); AVSC_DECLARE_FUNC(avs_prop_get_clip); AVSC_DECLARE_FUNC(avs_prop_get_frame); AVSC_DECLARE_FUNC(avs_prop_delete_key); AVSC_DECLARE_FUNC(avs_prop_set_int); AVSC_DECLARE_FUNC(avs_prop_set_float); AVSC_DECLARE_FUNC(avs_prop_set_data); AVSC_DECLARE_FUNC(avs_prop_set_clip); AVSC_DECLARE_FUNC(avs_prop_set_frame); AVSC_DECLARE_FUNC(avs_prop_get_int_array); AVSC_DECLARE_FUNC(avs_prop_get_float_array); AVSC_DECLARE_FUNC(avs_prop_set_int_array); AVSC_DECLARE_FUNC(avs_prop_set_float_array); AVSC_DECLARE_FUNC(avs_clear_map); // NewVideoFrame with frame properties AVSC_DECLARE_FUNC(avs_new_video_frame_p); AVSC_DECLARE_FUNC(avs_new_video_frame_p_a); AVSC_DECLARE_FUNC(avs_get_env_property); AVSC_DECLARE_FUNC(avs_get_var_try); AVSC_DECLARE_FUNC(avs_get_var_bool); AVSC_DECLARE_FUNC(avs_get_var_int); AVSC_DECLARE_FUNC(avs_get_var_double); AVSC_DECLARE_FUNC(avs_get_var_string); AVSC_DECLARE_FUNC(avs_get_var_long); AVSC_DECLARE_FUNC(avs_pool_allocate); AVSC_DECLARE_FUNC(avs_pool_free); // V9 AVSC_DECLARE_FUNC(avs_is_property_writable); AVSC_DECLARE_FUNC(avs_make_property_writable); }; #undef AVSC_DECLARE_FUNC #ifdef AVS26_FALLBACK_SIMULATION // Helper functions for fallback simulation // Avisynth+ extensions do not exist in classic Avisynth so they are simulated AVSC_INLINE int avs_is_xx_fallback_return_false(const AVS_VideoInfo * p) { return 0; } // Avisynth+ extensions do not exist in classic Avisynth so they are simulated AVSC_INLINE int avs_num_components_fallback(const AVS_VideoInfo * p) { switch (p->pixel_type) { case AVS_CS_UNKNOWN: return 0; case AVS_CS_RAW32: case AVS_CS_Y8: return 1; case AVS_CS_BGR32: return 4; // not planar but return the count default: return 3; } } // Avisynth+ extensions do not exist in classic Avisynth so they are simulated AVSC_INLINE int avs_component_size_fallback(const AVS_VideoInfo * p) { return 1; } // Avisynth+ extensions do not 
exist in classic Avisynth so they are simulated AVSC_INLINE int avs_bits_per_component_fallback(const AVS_VideoInfo * p) { return 8; } // End of helper functions for fallback simulation #endif // AVS26_FALLBACK_SIMULATION // avs_load_library() allocates an array for API procedure entries // reads and fills the entries with live procedure addresses. // AVSC_INLINE helpers which are calling into API procedures are not treated here (todo) AVSC_INLINE AVS_Library * avs_load_library() { AVS_Library *library = (AVS_Library *)malloc(sizeof(AVS_Library)); if (library == NULL) return NULL; library->handle = LoadLibraryA("avisynth"); if (library->handle == NULL) goto fail; #define __AVSC_STRINGIFY(x) #x #define AVSC_STRINGIFY(x) __AVSC_STRINGIFY(x) #define AVSC_LOAD_FUNC(name) {\ library->name = (name##_func) GetProcAddress(library->handle, AVSC_STRINGIFY(name));\ if (library->name == NULL)\ goto fail;\ } #ifdef AVS26_FALLBACK_SIMULATION // When an API function is not loadable, let's try a replacement // Missing Avisynth+ functions will be substituted with classic Avisynth compatible methods /* Avisynth+ When method is missing (classic Avisynth) avs_is_rgb48 constant false avs_is_rgb64 constant false avs_is_444 avs_is_yv24 avs_is_422 avs_is_yv16 avs_is_420 avs_is_yv12 avs_is_y avs_is_y8 avs_is_yuva constant false avs_is_planar_rgb constant false avs_is_planar_rgba constant false avs_num_components special: avs_num_components_fake Y8:1 RGB32:4 else 3 avs_component_size constant 1 (1 bytes/component) avs_bits_per_component constant 8 (8 bits/component) */ // try to load an alternative function #define AVSC_LOAD_FUNC_FALLBACK(name,name2) {\ library->name = (name##_func) GetProcAddress(library->handle, AVSC_STRINGIFY(name));\ if (library->name == NULL)\ library->name = (name##_func) GetProcAddress(library->handle, AVSC_STRINGIFY(name2));\ if (library->name == NULL)\ goto fail;\ } // try to assign a replacement function #define AVSC_LOAD_FUNC_FALLBACK_SIMULATED(name,name2) {\ library->name = (name##_func) GetProcAddress(library->handle, AVSC_STRINGIFY(name));\ if (library->name == NULL)\ library->name = name2;\ if (library->name == NULL)\ goto fail;\ } #endif // AVS26_FALLBACK_SIMULATION AVSC_LOAD_FUNC(avs_add_function); AVSC_LOAD_FUNC(avs_at_exit); AVSC_LOAD_FUNC(avs_bit_blt); AVSC_LOAD_FUNC(avs_check_version); AVSC_LOAD_FUNC(avs_clip_get_error); AVSC_LOAD_FUNC(avs_copy_clip); AVSC_LOAD_FUNC(avs_copy_value); AVSC_LOAD_FUNC(avs_copy_video_frame); AVSC_LOAD_FUNC(avs_create_script_environment); AVSC_LOAD_FUNC(avs_delete_script_environment); AVSC_LOAD_FUNC(avs_function_exists); AVSC_LOAD_FUNC(avs_get_audio); AVSC_LOAD_FUNC(avs_get_cpu_flags); AVSC_LOAD_FUNC(avs_get_frame); AVSC_LOAD_FUNC(avs_get_parity); AVSC_LOAD_FUNC(avs_get_var); AVSC_LOAD_FUNC(avs_get_version); AVSC_LOAD_FUNC(avs_get_video_info); AVSC_LOAD_FUNC(avs_invoke); AVSC_LOAD_FUNC(avs_make_writable); AVSC_LOAD_FUNC(avs_new_c_filter); AVSC_LOAD_FUNC(avs_new_video_frame_a); AVSC_LOAD_FUNC(avs_release_clip); AVSC_LOAD_FUNC(avs_release_value); AVSC_LOAD_FUNC(avs_release_video_frame); AVSC_LOAD_FUNC(avs_save_string); AVSC_LOAD_FUNC(avs_set_cache_hints); AVSC_LOAD_FUNC(avs_set_global_var); AVSC_LOAD_FUNC(avs_set_memory_max); AVSC_LOAD_FUNC(avs_set_to_clip); AVSC_LOAD_FUNC(avs_set_var); AVSC_LOAD_FUNC(avs_set_working_dir); AVSC_LOAD_FUNC(avs_sprintf); AVSC_LOAD_FUNC(avs_subframe); AVSC_LOAD_FUNC(avs_subframe_planar); AVSC_LOAD_FUNC(avs_take_clip); AVSC_LOAD_FUNC(avs_vsprintf); AVSC_LOAD_FUNC(avs_get_error); AVSC_LOAD_FUNC(avs_is_yv24); 
AVSC_LOAD_FUNC(avs_is_yv16); AVSC_LOAD_FUNC(avs_is_yv12); AVSC_LOAD_FUNC(avs_is_yv411); AVSC_LOAD_FUNC(avs_is_y8); AVSC_LOAD_FUNC(avs_is_color_space); AVSC_LOAD_FUNC(avs_get_plane_width_subsampling); AVSC_LOAD_FUNC(avs_get_plane_height_subsampling); AVSC_LOAD_FUNC(avs_bits_per_pixel); AVSC_LOAD_FUNC(avs_bytes_from_pixels); AVSC_LOAD_FUNC(avs_row_size); AVSC_LOAD_FUNC(avs_bmp_size); AVSC_LOAD_FUNC(avs_get_pitch_p); AVSC_LOAD_FUNC(avs_get_row_size_p); AVSC_LOAD_FUNC(avs_get_height_p); AVSC_LOAD_FUNC(avs_get_read_ptr_p); AVSC_LOAD_FUNC(avs_is_writable); AVSC_LOAD_FUNC(avs_get_write_ptr_p); // Avisynth+ specific #ifdef AVS26_FALLBACK_SIMULATION // replace with fallback fn when does not exist AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_is_rgb48, avs_is_xx_fallback_return_false); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_is_rgb64, avs_is_xx_fallback_return_false); AVSC_LOAD_FUNC_FALLBACK(avs_is_444, avs_is_yv24); AVSC_LOAD_FUNC_FALLBACK(avs_is_422, avs_is_yv16); AVSC_LOAD_FUNC_FALLBACK(avs_is_420, avs_is_yv12); AVSC_LOAD_FUNC_FALLBACK(avs_is_y, avs_is_y8); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_is_yuva, avs_is_xx_fallback_return_false); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_is_planar_rgb, avs_is_xx_fallback_return_false); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_is_planar_rgba, avs_is_xx_fallback_return_false); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_num_components, avs_num_components_fallback); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_component_size, avs_component_size_fallback); AVSC_LOAD_FUNC_FALLBACK_SIMULATED(avs_bits_per_component, avs_bits_per_component_fallback); #else // Avisynth+ specific AVSC_LOAD_FUNC(avs_is_rgb48); AVSC_LOAD_FUNC(avs_is_rgb64); AVSC_LOAD_FUNC(avs_is_444); AVSC_LOAD_FUNC(avs_is_422); AVSC_LOAD_FUNC(avs_is_420); AVSC_LOAD_FUNC(avs_is_y); AVSC_LOAD_FUNC(avs_is_yuva); AVSC_LOAD_FUNC(avs_is_planar_rgb); AVSC_LOAD_FUNC(avs_is_planar_rgba); AVSC_LOAD_FUNC(avs_num_components); AVSC_LOAD_FUNC(avs_component_size); AVSC_LOAD_FUNC(avs_bits_per_component); #endif // Avisynth+ interface V8, no backward compatible simulation AVSC_LOAD_FUNC(avs_subframe_planar_a); // frame properties AVSC_LOAD_FUNC(avs_copy_frame_props); AVSC_LOAD_FUNC(avs_get_frame_props_ro); AVSC_LOAD_FUNC(avs_get_frame_props_rw); AVSC_LOAD_FUNC(avs_prop_num_keys); AVSC_LOAD_FUNC(avs_prop_get_key); AVSC_LOAD_FUNC(avs_prop_num_elements); AVSC_LOAD_FUNC(avs_prop_get_type); AVSC_LOAD_FUNC(avs_prop_get_int); AVSC_LOAD_FUNC(avs_prop_get_float); AVSC_LOAD_FUNC(avs_prop_get_data); AVSC_LOAD_FUNC(avs_prop_get_data_size); AVSC_LOAD_FUNC(avs_prop_get_clip); AVSC_LOAD_FUNC(avs_prop_get_frame); AVSC_LOAD_FUNC(avs_prop_delete_key); AVSC_LOAD_FUNC(avs_prop_set_int); AVSC_LOAD_FUNC(avs_prop_set_float); AVSC_LOAD_FUNC(avs_prop_set_data); AVSC_LOAD_FUNC(avs_prop_set_clip); AVSC_LOAD_FUNC(avs_prop_set_frame); AVSC_LOAD_FUNC(avs_prop_get_int_array); AVSC_LOAD_FUNC(avs_prop_get_float_array); AVSC_LOAD_FUNC(avs_prop_set_int_array); AVSC_LOAD_FUNC(avs_prop_set_float_array); AVSC_LOAD_FUNC(avs_clear_map); // NewVideoFrame with frame properties AVSC_LOAD_FUNC(avs_new_video_frame_p); AVSC_LOAD_FUNC(avs_new_video_frame_p_a); AVSC_LOAD_FUNC(avs_get_env_property); AVSC_LOAD_FUNC(avs_get_var_try); AVSC_LOAD_FUNC(avs_get_var_bool); AVSC_LOAD_FUNC(avs_get_var_int); AVSC_LOAD_FUNC(avs_get_var_double); AVSC_LOAD_FUNC(avs_get_var_string); AVSC_LOAD_FUNC(avs_get_var_long); AVSC_LOAD_FUNC(avs_pool_allocate); AVSC_LOAD_FUNC(avs_pool_free); #undef __AVSC_STRINGIFY #undef AVSC_STRINGIFY #undef AVSC_LOAD_FUNC #undef AVSC_LOAD_FUNC_FALLBACK #undef 
AVSC_LOAD_FUNC_FALLBACK_SIMULATED return library; fail: free(library); return NULL; } AVSC_INLINE void avs_free_library(AVS_Library *library) { if (library == NULL) return; FreeLibrary(library->handle); free(library); } #endif #endif // AVS_WINDOWS #endif x264-master/extras/cl.h000066400000000000000000001666551502133446700151570ustar00rootroot00000000000000/******************************************************************************* * Copyright (c) 2008 - 2012 The Khronos Group Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and/or associated documentation files (the * "Materials"), to deal in the Materials without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Materials, and to * permit persons to whom the Materials are furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Materials. * * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. ******************************************************************************/ #ifndef __OPENCL_CL_H #define __OPENCL_CL_H #include "cl_platform.h" #ifdef __cplusplus extern "C" { #endif /******************************************************************************/ typedef struct _cl_platform_id * cl_platform_id; typedef struct _cl_device_id * cl_device_id; typedef struct _cl_context * cl_context; typedef struct _cl_command_queue * cl_command_queue; typedef struct _cl_mem * cl_mem; typedef struct _cl_program * cl_program; typedef struct _cl_kernel * cl_kernel; typedef struct _cl_event * cl_event; typedef struct _cl_sampler * cl_sampler; typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ typedef cl_ulong cl_bitfield; typedef cl_bitfield cl_device_type; typedef cl_uint cl_platform_info; typedef cl_uint cl_device_info; typedef cl_bitfield cl_device_fp_config; typedef cl_uint cl_device_mem_cache_type; typedef cl_uint cl_device_local_mem_type; typedef cl_bitfield cl_device_exec_capabilities; typedef cl_bitfield cl_command_queue_properties; typedef intptr_t cl_device_partition_property; typedef cl_bitfield cl_device_affinity_domain; typedef intptr_t cl_context_properties; typedef cl_uint cl_context_info; typedef cl_uint cl_command_queue_info; typedef cl_uint cl_channel_order; typedef cl_uint cl_channel_type; typedef cl_bitfield cl_mem_flags; typedef cl_uint cl_mem_object_type; typedef cl_uint cl_mem_info; typedef cl_bitfield cl_mem_migration_flags; typedef cl_uint cl_image_info; typedef cl_uint cl_buffer_create_type; typedef cl_uint cl_addressing_mode; typedef cl_uint cl_filter_mode; typedef cl_uint cl_sampler_info; typedef cl_bitfield cl_map_flags; typedef cl_uint cl_program_info; typedef cl_uint cl_program_build_info; typedef cl_uint cl_program_binary_type; typedef cl_int cl_build_status; typedef cl_uint cl_kernel_info; typedef cl_uint cl_kernel_arg_info; typedef cl_uint cl_kernel_arg_address_qualifier; typedef cl_uint cl_kernel_arg_access_qualifier; typedef cl_bitfield cl_kernel_arg_type_qualifier; typedef cl_uint cl_kernel_work_group_info; typedef cl_uint cl_event_info; typedef cl_uint cl_command_type; typedef cl_uint cl_profiling_info; typedef struct _cl_image_format { cl_channel_order image_channel_order; cl_channel_type image_channel_data_type; } cl_image_format; typedef struct _cl_image_desc { cl_mem_object_type image_type; size_t image_width; size_t image_height; size_t image_depth; size_t image_array_size; size_t image_row_pitch; size_t image_slice_pitch; cl_uint num_mip_levels; cl_uint num_samples; cl_mem buffer; } cl_image_desc; typedef struct _cl_buffer_region { size_t origin; size_t size; } cl_buffer_region; /******************************************************************************/ /* Error Codes */ #define CL_SUCCESS 0 #define CL_DEVICE_NOT_FOUND -1 #define CL_DEVICE_NOT_AVAILABLE -2 #define CL_COMPILER_NOT_AVAILABLE -3 #define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 #define CL_OUT_OF_RESOURCES -5 #define CL_OUT_OF_HOST_MEMORY -6 #define CL_PROFILING_INFO_NOT_AVAILABLE -7 #define CL_MEM_COPY_OVERLAP -8 #define CL_IMAGE_FORMAT_MISMATCH -9 #define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 #define CL_BUILD_PROGRAM_FAILURE -11 #define CL_MAP_FAILURE -12 #define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 #define CL_COMPILE_PROGRAM_FAILURE -15 #define CL_LINKER_NOT_AVAILABLE -16 #define CL_LINK_PROGRAM_FAILURE -17 #define CL_DEVICE_PARTITION_FAILED -18 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 #define CL_INVALID_VALUE -30 #define CL_INVALID_DEVICE_TYPE -31 #define CL_INVALID_PLATFORM -32 #define CL_INVALID_DEVICE -33 #define CL_INVALID_CONTEXT -34 #define CL_INVALID_QUEUE_PROPERTIES -35 #define CL_INVALID_COMMAND_QUEUE -36 #define CL_INVALID_HOST_PTR -37 #define CL_INVALID_MEM_OBJECT -38 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 #define CL_INVALID_IMAGE_SIZE -40 #define CL_INVALID_SAMPLER -41 #define CL_INVALID_BINARY -42 #define CL_INVALID_BUILD_OPTIONS -43 #define CL_INVALID_PROGRAM -44 #define CL_INVALID_PROGRAM_EXECUTABLE -45 #define CL_INVALID_KERNEL_NAME -46 #define CL_INVALID_KERNEL_DEFINITION -47 #define CL_INVALID_KERNEL -48 #define CL_INVALID_ARG_INDEX -49 #define CL_INVALID_ARG_VALUE -50 #define 
CL_INVALID_ARG_SIZE -51 #define CL_INVALID_KERNEL_ARGS -52 #define CL_INVALID_WORK_DIMENSION -53 #define CL_INVALID_WORK_GROUP_SIZE -54 #define CL_INVALID_WORK_ITEM_SIZE -55 #define CL_INVALID_GLOBAL_OFFSET -56 #define CL_INVALID_EVENT_WAIT_LIST -57 #define CL_INVALID_EVENT -58 #define CL_INVALID_OPERATION -59 #define CL_INVALID_GL_OBJECT -60 #define CL_INVALID_BUFFER_SIZE -61 #define CL_INVALID_MIP_LEVEL -62 #define CL_INVALID_GLOBAL_WORK_SIZE -63 #define CL_INVALID_PROPERTY -64 #define CL_INVALID_IMAGE_DESCRIPTOR -65 #define CL_INVALID_COMPILER_OPTIONS -66 #define CL_INVALID_LINKER_OPTIONS -67 #define CL_INVALID_DEVICE_PARTITION_COUNT -68 /* OpenCL Version */ #define CL_VERSION_1_0 1 #define CL_VERSION_1_1 1 #define CL_VERSION_1_2 1 /* cl_bool */ #define CL_FALSE 0 #define CL_TRUE 1 #define CL_BLOCKING CL_TRUE #define CL_NON_BLOCKING CL_FALSE /* cl_platform_info */ #define CL_PLATFORM_PROFILE 0x0900 #define CL_PLATFORM_VERSION 0x0901 #define CL_PLATFORM_NAME 0x0902 #define CL_PLATFORM_VENDOR 0x0903 #define CL_PLATFORM_EXTENSIONS 0x0904 /* cl_device_type - bitfield */ #define CL_DEVICE_TYPE_DEFAULT (1 << 0) #define CL_DEVICE_TYPE_CPU (1 << 1) #define CL_DEVICE_TYPE_GPU (1 << 2) #define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) #define CL_DEVICE_TYPE_CUSTOM (1 << 4) #define CL_DEVICE_TYPE_ALL 0xFFFFFFFF /* cl_device_info */ #define CL_DEVICE_TYPE 0x1000 #define CL_DEVICE_VENDOR_ID 0x1001 #define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 #define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 #define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B #define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C #define CL_DEVICE_ADDRESS_BITS 0x100D #define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F #define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 #define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 #define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 #define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 #define CL_DEVICE_IMAGE_SUPPORT 0x1016 #define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 #define CL_DEVICE_MAX_SAMPLERS 0x1018 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A #define CL_DEVICE_SINGLE_FP_CONFIG 0x101B #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E #define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 #define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 #define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 #define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 #define CL_DEVICE_ENDIAN_LITTLE 0x1026 #define CL_DEVICE_AVAILABLE 0x1027 #define CL_DEVICE_COMPILER_AVAILABLE 0x1028 #define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 #define CL_DEVICE_QUEUE_PROPERTIES 0x102A #define CL_DEVICE_NAME 0x102B #define CL_DEVICE_VENDOR 0x102C #define CL_DRIVER_VERSION 0x102D #define CL_DEVICE_PROFILE 0x102E #define CL_DEVICE_VERSION 0x102F #define CL_DEVICE_EXTENSIONS 0x1030 #define CL_DEVICE_PLATFORM 0x1031 #define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 /* 0x1033 
reserved for CL_DEVICE_HALF_FP_CONFIG */ #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 #define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C #define CL_DEVICE_OPENCL_C_VERSION 0x103D #define CL_DEVICE_LINKER_AVAILABLE 0x103E #define CL_DEVICE_BUILT_IN_KERNELS 0x103F #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 #define CL_DEVICE_PARENT_DEVICE 0x1042 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 #define CL_DEVICE_PARTITION_PROPERTIES 0x1044 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 #define CL_DEVICE_PARTITION_TYPE 0x1046 #define CL_DEVICE_REFERENCE_COUNT 0x1047 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 #define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B /* cl_device_fp_config - bitfield */ #define CL_FP_DENORM (1 << 0) #define CL_FP_INF_NAN (1 << 1) #define CL_FP_ROUND_TO_NEAREST (1 << 2) #define CL_FP_ROUND_TO_ZERO (1 << 3) #define CL_FP_ROUND_TO_INF (1 << 4) #define CL_FP_FMA (1 << 5) #define CL_FP_SOFT_FLOAT (1 << 6) #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) /* cl_device_mem_cache_type */ #define CL_NONE 0x0 #define CL_READ_ONLY_CACHE 0x1 #define CL_READ_WRITE_CACHE 0x2 /* cl_device_local_mem_type */ #define CL_LOCAL 0x1 #define CL_GLOBAL 0x2 /* cl_device_exec_capabilities - bitfield */ #define CL_EXEC_KERNEL (1 << 0) #define CL_EXEC_NATIVE_KERNEL (1 << 1) /* cl_command_queue_properties - bitfield */ #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) #define CL_QUEUE_PROFILING_ENABLE (1 << 1) /* cl_context_info */ #define CL_CONTEXT_REFERENCE_COUNT 0x1080 #define CL_CONTEXT_DEVICES 0x1081 #define CL_CONTEXT_PROPERTIES 0x1082 #define CL_CONTEXT_NUM_DEVICES 0x1083 /* cl_context_properties */ #define CL_CONTEXT_PLATFORM 0x1084 #define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 /* cl_device_partition_property */ #define CL_DEVICE_PARTITION_EQUALLY 0x1086 #define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 /* cl_device_affinity_domain */ #define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) /* cl_command_queue_info */ #define CL_QUEUE_CONTEXT 0x1090 #define CL_QUEUE_DEVICE 0x1091 #define CL_QUEUE_REFERENCE_COUNT 0x1092 #define CL_QUEUE_PROPERTIES 0x1093 /* cl_mem_flags - bitfield */ #define CL_MEM_READ_WRITE (1 << 0) #define CL_MEM_WRITE_ONLY (1 << 1) #define CL_MEM_READ_ONLY (1 << 2) #define CL_MEM_USE_HOST_PTR (1 << 3) #define CL_MEM_ALLOC_HOST_PTR (1 << 4) #define CL_MEM_COPY_HOST_PTR (1 << 5) // reserved (1 << 6) #define CL_MEM_HOST_WRITE_ONLY (1 << 7) #define CL_MEM_HOST_READ_ONLY (1 << 8) #define CL_MEM_HOST_NO_ACCESS (1 << 9) /* cl_mem_migration_flags - bitfield */ #define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) /* cl_channel_order */ #define CL_R 0x10B0 #define 
CL_A 0x10B1 #define CL_RG 0x10B2 #define CL_RA 0x10B3 #define CL_RGB 0x10B4 #define CL_RGBA 0x10B5 #define CL_BGRA 0x10B6 #define CL_ARGB 0x10B7 #define CL_INTENSITY 0x10B8 #define CL_LUMINANCE 0x10B9 #define CL_Rx 0x10BA #define CL_RGx 0x10BB #define CL_RGBx 0x10BC #define CL_DEPTH 0x10BD #define CL_DEPTH_STENCIL 0x10BE /* cl_channel_type */ #define CL_SNORM_INT8 0x10D0 #define CL_SNORM_INT16 0x10D1 #define CL_UNORM_INT8 0x10D2 #define CL_UNORM_INT16 0x10D3 #define CL_UNORM_SHORT_565 0x10D4 #define CL_UNORM_SHORT_555 0x10D5 #define CL_UNORM_INT_101010 0x10D6 #define CL_SIGNED_INT8 0x10D7 #define CL_SIGNED_INT16 0x10D8 #define CL_SIGNED_INT32 0x10D9 #define CL_UNSIGNED_INT8 0x10DA #define CL_UNSIGNED_INT16 0x10DB #define CL_UNSIGNED_INT32 0x10DC #define CL_HALF_FLOAT 0x10DD #define CL_FLOAT 0x10DE #define CL_UNORM_INT24 0x10DF /* cl_mem_object_type */ #define CL_MEM_OBJECT_BUFFER 0x10F0 #define CL_MEM_OBJECT_IMAGE2D 0x10F1 #define CL_MEM_OBJECT_IMAGE3D 0x10F2 #define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 #define CL_MEM_OBJECT_IMAGE1D 0x10F4 #define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 #define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 /* cl_mem_info */ #define CL_MEM_TYPE 0x1100 #define CL_MEM_FLAGS 0x1101 #define CL_MEM_SIZE 0x1102 #define CL_MEM_HOST_PTR 0x1103 #define CL_MEM_MAP_COUNT 0x1104 #define CL_MEM_REFERENCE_COUNT 0x1105 #define CL_MEM_CONTEXT 0x1106 #define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 #define CL_MEM_OFFSET 0x1108 /* cl_image_info */ #define CL_IMAGE_FORMAT 0x1110 #define CL_IMAGE_ELEMENT_SIZE 0x1111 #define CL_IMAGE_ROW_PITCH 0x1112 #define CL_IMAGE_SLICE_PITCH 0x1113 #define CL_IMAGE_WIDTH 0x1114 #define CL_IMAGE_HEIGHT 0x1115 #define CL_IMAGE_DEPTH 0x1116 #define CL_IMAGE_ARRAY_SIZE 0x1117 #define CL_IMAGE_BUFFER 0x1118 #define CL_IMAGE_NUM_MIP_LEVELS 0x1119 #define CL_IMAGE_NUM_SAMPLES 0x111A /* cl_addressing_mode */ #define CL_ADDRESS_NONE 0x1130 #define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 #define CL_ADDRESS_CLAMP 0x1132 #define CL_ADDRESS_REPEAT 0x1133 #define CL_ADDRESS_MIRRORED_REPEAT 0x1134 /* cl_filter_mode */ #define CL_FILTER_NEAREST 0x1140 #define CL_FILTER_LINEAR 0x1141 /* cl_sampler_info */ #define CL_SAMPLER_REFERENCE_COUNT 0x1150 #define CL_SAMPLER_CONTEXT 0x1151 #define CL_SAMPLER_NORMALIZED_COORDS 0x1152 #define CL_SAMPLER_ADDRESSING_MODE 0x1153 #define CL_SAMPLER_FILTER_MODE 0x1154 /* cl_map_flags - bitfield */ #define CL_MAP_READ (1 << 0) #define CL_MAP_WRITE (1 << 1) #define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) /* cl_program_info */ #define CL_PROGRAM_REFERENCE_COUNT 0x1160 #define CL_PROGRAM_CONTEXT 0x1161 #define CL_PROGRAM_NUM_DEVICES 0x1162 #define CL_PROGRAM_DEVICES 0x1163 #define CL_PROGRAM_SOURCE 0x1164 #define CL_PROGRAM_BINARY_SIZES 0x1165 #define CL_PROGRAM_BINARIES 0x1166 #define CL_PROGRAM_NUM_KERNELS 0x1167 #define CL_PROGRAM_KERNEL_NAMES 0x1168 /* cl_program_build_info */ #define CL_PROGRAM_BUILD_STATUS 0x1181 #define CL_PROGRAM_BUILD_OPTIONS 0x1182 #define CL_PROGRAM_BUILD_LOG 0x1183 #define CL_PROGRAM_BINARY_TYPE 0x1184 /* cl_program_binary_type */ #define CL_PROGRAM_BINARY_TYPE_NONE 0x0 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 #define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 /* cl_build_status */ #define CL_BUILD_SUCCESS 0 #define CL_BUILD_NONE -1 #define CL_BUILD_ERROR -2 #define CL_BUILD_IN_PROGRESS -3 /* cl_kernel_info */ #define CL_KERNEL_FUNCTION_NAME 0x1190 #define CL_KERNEL_NUM_ARGS 0x1191 #define CL_KERNEL_REFERENCE_COUNT 0x1192 #define CL_KERNEL_CONTEXT 0x1193 #define CL_KERNEL_PROGRAM 
0x1194 #define CL_KERNEL_ATTRIBUTES 0x1195 /* cl_kernel_arg_info */ #define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 #define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 #define CL_KERNEL_ARG_TYPE_NAME 0x1198 #define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 #define CL_KERNEL_ARG_NAME 0x119A /* cl_kernel_arg_address_qualifier */ #define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B #define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C #define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D #define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E /* cl_kernel_arg_access_qualifier */ #define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 #define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 #define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 /* cl_kernel_arg_type_qualifer */ #define CL_KERNEL_ARG_TYPE_NONE 0 #define CL_KERNEL_ARG_TYPE_CONST (1 << 0) #define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) #define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) /* cl_kernel_work_group_info */ #define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 #define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 #define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 #define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 /* cl_event_info */ #define CL_EVENT_COMMAND_QUEUE 0x11D0 #define CL_EVENT_COMMAND_TYPE 0x11D1 #define CL_EVENT_REFERENCE_COUNT 0x11D2 #define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 #define CL_EVENT_CONTEXT 0x11D4 /* cl_command_type */ #define CL_COMMAND_NDRANGE_KERNEL 0x11F0 #define CL_COMMAND_TASK 0x11F1 #define CL_COMMAND_NATIVE_KERNEL 0x11F2 #define CL_COMMAND_READ_BUFFER 0x11F3 #define CL_COMMAND_WRITE_BUFFER 0x11F4 #define CL_COMMAND_COPY_BUFFER 0x11F5 #define CL_COMMAND_READ_IMAGE 0x11F6 #define CL_COMMAND_WRITE_IMAGE 0x11F7 #define CL_COMMAND_COPY_IMAGE 0x11F8 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA #define CL_COMMAND_MAP_BUFFER 0x11FB #define CL_COMMAND_MAP_IMAGE 0x11FC #define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD #define CL_COMMAND_MARKER 0x11FE #define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF #define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 #define CL_COMMAND_READ_BUFFER_RECT 0x1201 #define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 #define CL_COMMAND_COPY_BUFFER_RECT 0x1203 #define CL_COMMAND_USER 0x1204 #define CL_COMMAND_BARRIER 0x1205 #define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 #define CL_COMMAND_FILL_BUFFER 0x1207 #define CL_COMMAND_FILL_IMAGE 0x1208 /* command execution status */ #define CL_COMPLETE 0x0 #define CL_RUNNING 0x1 #define CL_SUBMITTED 0x2 #define CL_QUEUED 0x3 /* cl_buffer_create_type */ #define CL_BUFFER_CREATE_TYPE_REGION 0x1220 /* cl_profiling_info */ #define CL_PROFILING_COMMAND_QUEUED 0x1280 #define CL_PROFILING_COMMAND_SUBMIT 0x1281 #define CL_PROFILING_COMMAND_START 0x1282 #define CL_PROFILING_COMMAND_END 0x1283 /********************************************************************************************************/ /* Platform API */ extern CL_API_ENTRY cl_int CL_API_CALL clGetPlatformIDs(cl_uint /* num_entries */, cl_platform_id * /* platforms */, cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetPlatformInfo(cl_platform_id /* platform */, cl_platform_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Device APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDs(cl_platform_id /* platform */, cl_device_type /* device_type */, cl_uint /* num_entries */, 
cl_device_id * /* devices */, cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceInfo(cl_device_id /* device */, cl_device_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clCreateSubDevices(cl_device_id /* in_device */, const cl_device_partition_property * /* properties */, cl_uint /* num_devices */, cl_device_id * /* out_devices */, cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; /* Context APIs */ extern CL_API_ENTRY cl_context CL_API_CALL clCreateContext(const cl_context_properties * /* properties */, cl_uint /* num_devices */, const cl_device_id * /* devices */, void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(const cl_context_properties * /* properties */, cl_device_type /* device_type */, void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetContextInfo(cl_context /* context */, cl_context_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Command Queue APIs */ extern CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueue(cl_context /* context */, cl_device_id /* device */, cl_command_queue_properties /* properties */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetCommandQueueInfo(cl_command_queue /* command_queue */, cl_command_queue_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Memory Object APIs */ extern CL_API_ENTRY cl_mem CL_API_CALL clCreateBuffer(cl_context /* context */, cl_mem_flags /* flags */, size_t /* size */, void * /* host_ptr */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateSubBuffer(cl_mem /* buffer */, cl_mem_flags /* flags */, cl_buffer_create_type /* buffer_create_type */, const void * /* buffer_create_info */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, const cl_image_desc * /* image_desc */, void * /* host_ptr */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; extern 
CL_API_ENTRY cl_int CL_API_CALL clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetSupportedImageFormats(cl_context /* context */, cl_mem_flags /* flags */, cl_mem_object_type /* image_type */, cl_uint /* num_entries */, cl_image_format * /* image_formats */, cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetMemObjectInfo(cl_mem /* memobj */, cl_mem_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetImageInfo(cl_mem /* image */, cl_image_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clSetMemObjectDestructorCallback( cl_mem /* memobj */, void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; /* Sampler APIs */ extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSampler(cl_context /* context */, cl_bool /* normalized_coords */, cl_addressing_mode /* addressing_mode */, cl_filter_mode /* filter_mode */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetSamplerInfo(cl_sampler /* sampler */, cl_sampler_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Program Object APIs */ extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithSource(cl_context /* context */, cl_uint /* count */, const char ** /* strings */, const size_t * /* lengths */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithBinary(cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const size_t * /* lengths */, const unsigned char ** /* binaries */, cl_int * /* binary_status */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* kernel_names */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clBuildProgram(cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* options */, void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clCompileProgram(cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* options */, cl_uint /* num_input_headers */, const cl_program * /* input_headers */, const char ** /* header_include_names */, void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), void * /* 
user_data */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_program CL_API_CALL clLinkProgram(cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, const char * /* options */, cl_uint /* num_input_programs */, const cl_program * /* input_programs */, void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), void * /* user_data */, cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clGetProgramInfo(cl_program /* program */, cl_program_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetProgramBuildInfo(cl_program /* program */, cl_device_id /* device */, cl_program_build_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Kernel Object APIs */ extern CL_API_ENTRY cl_kernel CL_API_CALL clCreateKernel(cl_program /* program */, const char * /* kernel_name */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clCreateKernelsInProgram(cl_program /* program */, cl_uint /* num_kernels */, cl_kernel * /* kernels */, cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel /* kernel */, cl_uint /* arg_index */, size_t /* arg_size */, const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelInfo(cl_kernel /* kernel */, cl_kernel_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelArgInfo(cl_kernel /* kernel */, cl_uint /* arg_indx */, cl_kernel_arg_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelWorkGroupInfo(cl_kernel /* kernel */, cl_device_id /* device */, cl_kernel_work_group_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Event Object APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clWaitForEvents(cl_uint /* num_events */, const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetEventInfo(cl_event /* event */, cl_event_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_event CL_API_CALL clCreateUserEvent(cl_context /* context */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clSetUserEventStatus(cl_event /* event */, cl_int /* execution_status */) 
CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clSetEventCallback( cl_event /* event */, cl_int /* command_exec_callback_type */, void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; /* Profiling APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clGetEventProfilingInfo(cl_event /* event */, cl_profiling_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; /* Flush and Finish APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; /* Enqueued Commands APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBuffer(cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, size_t /* offset */, size_t /* size */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */, size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, size_t /* host_slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBuffer(cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, size_t /* offset */, size_t /* size */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */, size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, size_t /* host_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueFillBuffer(cl_command_queue /* command_queue */, cl_mem /* buffer */, const void * /* pattern */, size_t /* pattern_size */, size_t /* offset */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBuffer(cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, size_t /* src_offset */, size_t /* dst_offset */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, const size_t * /* src_origin */, const size_t * /* dst_origin */, const size_t * /* region */, size_t /* src_row_pitch */, size_t /* src_slice_pitch */, size_t /* 
dst_row_pitch */, size_t /* dst_slice_pitch */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadImage(cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_read */, const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* row_pitch */, size_t /* slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteImage(cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_write */, const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* input_row_pitch */, size_t /* input_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueFillImage(cl_command_queue /* command_queue */, cl_mem /* image */, const void * /* fill_color */, const size_t * /* origin[3] */, const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyImage(cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_image */, const size_t * /* src_origin[3] */, const size_t * /* dst_origin[3] */, const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_buffer */, const size_t * /* src_origin[3] */, const size_t * /* region[3] */, size_t /* dst_offset */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_image */, size_t /* src_offset */, const size_t * /* dst_origin[3] */, const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY void * CL_API_CALL clEnqueueMapBuffer(cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_map */, cl_map_flags /* map_flags */, size_t /* offset */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY void * CL_API_CALL clEnqueueMapImage(cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_map */, cl_map_flags /* map_flags */, const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t * /* image_row_pitch */, size_t * /* image_slice_pitch */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, cl_mem /* memobj */, void * /* mapped_ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, 
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, cl_uint /* num_mem_objects */, const cl_mem * /* mem_objects */, cl_mem_migration_flags /* flags */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* work_dim */, const size_t * /* global_work_offset */, const size_t * /* global_work_size */, const size_t * /* local_work_size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueTask(cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueNativeKernel(cl_command_queue /* command_queue */, void (CL_CALLBACK * /*user_func*/)(void *), void * /* args */, size_t /* cb_args */, cl_uint /* num_mem_objects */, const cl_mem * /* mem_list */, const void ** /* args_mem_loc */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; /* Extension function access * * Returns the extension function address for the given function name, * or NULL if a valid function can not be found. The client must * check to make sure the address is not NULL, before using or * calling the returned function address. 
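 *
 * Illustrative sketch, not part of the original header: the usual pattern is to
 * cast the returned pointer to the extension's function type and test it before
 * use. The extension name below is a hypothetical placeholder, and "platform"
 * is assumed to come from clGetPlatformIDs():
 *
 *     typedef void (CL_API_CALL *ext_fn_t)(void);
 *     ext_fn_t fn = (ext_fn_t) clGetExtensionFunctionAddressForPlatform( platform,
 *                                  "clSomeVendorExtensionNameKHR" );
 *     if( fn != NULL )
 *     {
 *         // safe to call through fn (after casting to the real signature)
 *     }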
*/ extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; // Deprecated OpenCL 1.1 APIs extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage2D(cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, size_t /* image_width */, size_t /* image_height */, size_t /* image_row_pitch */, void * /* host_ptr */, cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateImage3D(cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, size_t /* image_width */, size_t /* image_height */, size_t /* image_depth */, size_t /* image_row_pitch */, size_t /* image_slice_pitch */, void * /* host_ptr */, cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL clEnqueueMarker(cl_command_queue /* command_queue */, cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL clEnqueueWaitForEvents(cl_command_queue /* command_queue */, cl_uint /* num_events */, const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_H */ x264-master/extras/cl_platform.h000066400000000000000000001210751502133446700170460ustar00rootroot00000000000000/********************************************************************************** * Copyright (c) 2008-2012 The Khronos Group Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and/or associated documentation files (the * "Materials"), to deal in the Materials without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Materials, and to * permit persons to whom the Materials are furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Materials. * * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
**********************************************************************************/ /* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ #ifndef __CL_PLATFORM_H #define __CL_PLATFORM_H #ifdef __APPLE__ /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ #include #endif #ifdef __cplusplus extern "C" { #endif #if defined(_WIN32) #define CL_API_ENTRY #define CL_API_CALL __stdcall #define CL_CALLBACK __stdcall #else #define CL_API_ENTRY #define CL_API_CALL #define CL_CALLBACK #endif #ifdef __APPLE__ #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) #ifndef UNAVAILABLE_ATTRIBUTE #define UNAVAILABLE_ATTRIBUTE #endif #ifdef AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER #else #define CL_API_SUFFIX__VERSION_1_0 UNAVAILABLE_ATTRIBUTE #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE #endif #ifdef AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 #else #define CL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE #define GCL_API_SUFFIX__VERSION_1_1 UNAVAILABLE_ATTRIBUTE #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATE CL_EXT_SUFFIX__VERSION_1_0 #endif #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 #else #define CL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE #define GCL_API_SUFFIX__VERSION_1_2 UNAVAILABLE_ATTRIBUTE #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK UNAVAILABLE_ATTRIBUTE #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX__VERSION_1_1 #endif #else #define CL_EXTENSION_WEAK_LINK #define CL_API_SUFFIX__VERSION_1_0 #define CL_EXT_SUFFIX__VERSION_1_0 #define CL_API_SUFFIX__VERSION_1_1 #define CL_EXT_SUFFIX__VERSION_1_1 #define CL_API_SUFFIX__VERSION_1_2 #define CL_EXT_SUFFIX__VERSION_1_2 #ifdef __GNUC__ #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #endif #elif _WIN32 #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED #define 
CL_EXT_PREFIX__VERSION_1_0_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) #endif #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) #endif #else #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #endif #endif #if (defined (_WIN32) && defined(_MSC_VER)) /* scalar types */ typedef signed __int8 cl_char; typedef unsigned __int8 cl_uchar; typedef signed __int16 cl_short; typedef unsigned __int16 cl_ushort; typedef signed __int32 cl_int; typedef unsigned __int32 cl_uint; typedef signed __int64 cl_long; typedef unsigned __int64 cl_ulong; typedef unsigned __int16 cl_half; typedef float cl_float; typedef double cl_double; /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 #define CL_SCHAR_MAX 127 #define CL_SCHAR_MIN (-127-1) #define CL_CHAR_MAX CL_SCHAR_MAX #define CL_CHAR_MIN CL_SCHAR_MIN #define CL_UCHAR_MAX 255 #define CL_SHRT_MAX 32767 #define CL_SHRT_MIN (-32767-1) #define CL_USHRT_MAX 65535 #define CL_INT_MAX 2147483647 #define CL_INT_MIN (-2147483647-1) #define CL_UINT_MAX 0xffffffffU #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) #define CL_FLT_DIG 6 #define CL_FLT_MANT_DIG 24 #define CL_FLT_MAX_10_EXP +38 #define CL_FLT_MAX_EXP +128 #define CL_FLT_MIN_10_EXP -37 #define CL_FLT_MIN_EXP -125 #define CL_FLT_RADIX 2 #define CL_FLT_MAX 340282346638528859811704183484516925440.0f #define CL_FLT_MIN 1.175494350822287507969e-38f #define CL_FLT_EPSILON 0x1.0p-23f #define CL_DBL_DIG 15 #define CL_DBL_MANT_DIG 53 #define CL_DBL_MAX_10_EXP +308 #define CL_DBL_MAX_EXP +1024 #define CL_DBL_MIN_10_EXP -307 #define CL_DBL_MIN_EXP -1021 #define CL_DBL_RADIX 2 #define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 #define CL_DBL_MIN 2.225073858507201383090e-308 #define CL_DBL_EPSILON 2.220446049250313080847e-16 #define CL_M_E 2.718281828459045090796 #define CL_M_LOG2E 1.442695040888963387005 #define CL_M_LOG10E 0.434294481903251816668 #define CL_M_LN2 0.693147180559945286227 #define CL_M_LN10 2.302585092994045901094 #define CL_M_PI 3.141592653589793115998 #define CL_M_PI_2 1.570796326794896557999 #define CL_M_PI_4 0.785398163397448278999 #define CL_M_1_PI 0.318309886183790691216 #define CL_M_2_PI 0.636619772367581382433 #define CL_M_2_SQRTPI 1.128379167095512558561 #define CL_M_SQRT2 1.414213562373095145475 #define CL_M_SQRT1_2 0.707106781186547572737 #define CL_M_E_F 2.71828174591064f #define CL_M_LOG2E_F 1.44269502162933f #define CL_M_LOG10E_F 0.43429449200630f #define CL_M_LN2_F 0.69314718246460f #define CL_M_LN10_F 2.30258512496948f #define CL_M_PI_F 3.14159274101257f #define CL_M_PI_2_F 1.57079637050629f #define CL_M_PI_4_F 0.78539818525314f #define CL_M_1_PI_F 0.31830987334251f #define CL_M_2_PI_F 0.63661974668503f #define CL_M_2_SQRTPI_F 1.12837922573090f #define 
CL_M_SQRT2_F 1.41421353816986f #define CL_M_SQRT1_2_F 0.70710676908493f #define CL_NAN (CL_INFINITY - CL_INFINITY) #define CL_HUGE_VALF ((cl_float) 1e50) #define CL_HUGE_VAL ((cl_double) 1e500) #define CL_MAXFLOAT CL_FLT_MAX #define CL_INFINITY CL_HUGE_VALF #else #include /* scalar types */ typedef int8_t cl_char; typedef uint8_t cl_uchar; typedef int16_t cl_short __attribute__((aligned(2))); typedef uint16_t cl_ushort __attribute__((aligned(2))); typedef int32_t cl_int __attribute__((aligned(4))); typedef uint32_t cl_uint __attribute__((aligned(4))); typedef int64_t cl_long __attribute__((aligned(8))); typedef uint64_t cl_ulong __attribute__((aligned(8))); typedef uint16_t cl_half __attribute__((aligned(2))); typedef float cl_float __attribute__((aligned(4))); typedef double cl_double __attribute__((aligned(8))); /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 #define CL_SCHAR_MAX 127 #define CL_SCHAR_MIN (-127-1) #define CL_CHAR_MAX CL_SCHAR_MAX #define CL_CHAR_MIN CL_SCHAR_MIN #define CL_UCHAR_MAX 255 #define CL_SHRT_MAX 32767 #define CL_SHRT_MIN (-32767-1) #define CL_USHRT_MAX 65535 #define CL_INT_MAX 2147483647 #define CL_INT_MIN (-2147483647-1) #define CL_UINT_MAX 0xffffffffU #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) #define CL_FLT_DIG 6 #define CL_FLT_MANT_DIG 24 #define CL_FLT_MAX_10_EXP +38 #define CL_FLT_MAX_EXP +128 #define CL_FLT_MIN_10_EXP -37 #define CL_FLT_MIN_EXP -125 #define CL_FLT_RADIX 2 #define CL_FLT_MAX 0x1.fffffep127f #define CL_FLT_MIN 0x1.0p-126f #define CL_FLT_EPSILON 0x1.0p-23f #define CL_DBL_DIG 15 #define CL_DBL_MANT_DIG 53 #define CL_DBL_MAX_10_EXP +308 #define CL_DBL_MAX_EXP +1024 #define CL_DBL_MIN_10_EXP -307 #define CL_DBL_MIN_EXP -1021 #define CL_DBL_RADIX 2 #define CL_DBL_MAX 0x1.fffffffffffffp1023 #define CL_DBL_MIN 0x1.0p-1022 #define CL_DBL_EPSILON 0x1.0p-52 #define CL_M_E 2.718281828459045090796 #define CL_M_LOG2E 1.442695040888963387005 #define CL_M_LOG10E 0.434294481903251816668 #define CL_M_LN2 0.693147180559945286227 #define CL_M_LN10 2.302585092994045901094 #define CL_M_PI 3.141592653589793115998 #define CL_M_PI_2 1.570796326794896557999 #define CL_M_PI_4 0.785398163397448278999 #define CL_M_1_PI 0.318309886183790691216 #define CL_M_2_PI 0.636619772367581382433 #define CL_M_2_SQRTPI 1.128379167095512558561 #define CL_M_SQRT2 1.414213562373095145475 #define CL_M_SQRT1_2 0.707106781186547572737 #define CL_M_E_F 2.71828174591064f #define CL_M_LOG2E_F 1.44269502162933f #define CL_M_LOG10E_F 0.43429449200630f #define CL_M_LN2_F 0.69314718246460f #define CL_M_LN10_F 2.30258512496948f #define CL_M_PI_F 3.14159274101257f #define CL_M_PI_2_F 1.57079637050629f #define CL_M_PI_4_F 0.78539818525314f #define CL_M_1_PI_F 0.31830987334251f #define CL_M_2_PI_F 0.63661974668503f #define CL_M_2_SQRTPI_F 1.12837922573090f #define CL_M_SQRT2_F 1.41421353816986f #define CL_M_SQRT1_2_F 0.70710676908493f #if defined( __GNUC__ ) #define CL_HUGE_VALF __builtin_huge_valf() #define CL_HUGE_VAL __builtin_huge_val() #define CL_NAN __builtin_nanf( "" ) #else #define CL_HUGE_VALF ((cl_float) 1e50) #define CL_HUGE_VAL ((cl_double) 1e500) float nanf( const char * ); #define CL_NAN nanf( "" ) #endif #define CL_MAXFLOAT CL_FLT_MAX #define CL_INFINITY CL_HUGE_VALF #endif #include /* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. 
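   Illustrative note, not part of the original header: the mirrors are plain
   integer typedefs, so host code can carry GL object names without pulling in a
   GL header. A hedged sketch (the value would normally come from glGenBuffers()
   or a similar GL call; the CL/GL sharing entry points themselves live in
   cl_gl.h, not in this file):

       cl_GLuint gl_buffer_name = 0;   // GL buffer object name, created elsewhere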
*/ typedef unsigned int cl_GLuint; typedef int cl_GLint; typedef unsigned int cl_GLenum; /* * Vector types * * Note: OpenCL requires that all types be naturally aligned. * This means that vector types must be naturally aligned. * For example, a vector of four floats must be aligned to * a 16 byte boundary (calculated as 4 * the natural 4-byte * alignment of the float). The alignment qualifiers here * will only function properly if your compiler supports them * and if you don't actively work to defeat them. For example, * in order for a cl_float4 to be 16 byte aligned in a struct, * the start of the struct must itself be 16-byte aligned. * * Maintaining proper alignment is the user's responsibility. */ /* Define basic vector types */ #if defined( __VEC__ ) #include /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ typedef vector unsigned char __cl_uchar16; typedef vector signed char __cl_char16; typedef vector unsigned short __cl_ushort8; typedef vector signed short __cl_short8; typedef vector unsigned int __cl_uint4; typedef vector signed int __cl_int4; typedef vector float __cl_float4; #define __CL_UCHAR16__ 1 #define __CL_CHAR16__ 1 #define __CL_USHORT8__ 1 #define __CL_SHORT8__ 1 #define __CL_UINT4__ 1 #define __CL_INT4__ 1 #define __CL_FLOAT4__ 1 #endif #if defined( __SSE__ ) #if defined( __MINGW64__ ) #include #else #include #endif #if defined( __GNUC__ ) typedef float __cl_float4 __attribute__((vector_size(16))); #else typedef __m128 __cl_float4; #endif #define __CL_FLOAT4__ 1 #endif #if defined( __SSE2__ ) #if defined( __MINGW64__ ) #include #else #include #endif #if defined( __GNUC__ ) typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); typedef cl_char __cl_char16 __attribute__((vector_size(16))); typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); typedef cl_short __cl_short8 __attribute__((vector_size(16))); typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); typedef cl_int __cl_int4 __attribute__((vector_size(16))); typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); typedef cl_long __cl_long2 __attribute__((vector_size(16))); typedef cl_double __cl_double2 __attribute__((vector_size(16))); #else typedef __m128i __cl_uchar16; typedef __m128i __cl_char16; typedef __m128i __cl_ushort8; typedef __m128i __cl_short8; typedef __m128i __cl_uint4; typedef __m128i __cl_int4; typedef __m128i __cl_ulong2; typedef __m128i __cl_long2; typedef __m128d __cl_double2; #endif #define __CL_UCHAR16__ 1 #define __CL_CHAR16__ 1 #define __CL_USHORT8__ 1 #define __CL_SHORT8__ 1 #define __CL_INT4__ 1 #define __CL_UINT4__ 1 #define __CL_ULONG2__ 1 #define __CL_LONG2__ 1 #define __CL_DOUBLE2__ 1 #endif #if defined( __MMX__ ) #include #if defined( __GNUC__ ) typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); typedef cl_char __cl_char8 __attribute__((vector_size(8))); typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); typedef cl_short __cl_short4 __attribute__((vector_size(8))); typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); typedef cl_int __cl_int2 __attribute__((vector_size(8))); typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); typedef cl_long __cl_long1 __attribute__((vector_size(8))); typedef cl_float __cl_float2 __attribute__((vector_size(8))); #else typedef __m64 __cl_uchar8; typedef __m64 __cl_char8; typedef __m64 __cl_ushort4; typedef __m64 __cl_short4; typedef __m64 __cl_uint2; typedef __m64 __cl_int2; typedef __m64 __cl_ulong1; typedef 
__m64 __cl_long1; typedef __m64 __cl_float2; #endif #define __CL_UCHAR8__ 1 #define __CL_CHAR8__ 1 #define __CL_USHORT4__ 1 #define __CL_SHORT4__ 1 #define __CL_INT2__ 1 #define __CL_UINT2__ 1 #define __CL_ULONG1__ 1 #define __CL_LONG1__ 1 #define __CL_FLOAT2__ 1 #endif #if defined( __AVX__ ) #if defined( __MINGW64__ ) #include #else #include #endif #if defined( __GNUC__ ) typedef cl_float __cl_float8 __attribute__((vector_size(32))); typedef cl_double __cl_double4 __attribute__((vector_size(32))); #else typedef __m256 __cl_float8; typedef __m256d __cl_double4; #endif #define __CL_FLOAT8__ 1 #define __CL_DOUBLE4__ 1 #endif /* Define alignment keys */ #if defined( __GNUC__ ) #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) #elif defined( _WIN32) && (_MSC_VER) /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ /* #include */ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ #define CL_ALIGNED(_x) #else #warning Need to implement some method to align data here #define CL_ALIGNED(_x) #endif /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) /* .xyzw and .s0123...{f|F} are supported */ #define CL_HAS_NAMED_VECTOR_FIELDS 1 /* .hi and .lo are supported */ #define CL_HAS_HI_LO_VECTOR_FIELDS 1 #endif /* Define cl_vector types */ /* ---- cl_charn ---- */ typedef union { cl_char CL_ALIGNED(2) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_char x, y; }; __extension__ struct{ cl_char s0, s1; }; __extension__ struct{ cl_char lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2; #endif }cl_char2; typedef union { cl_char CL_ALIGNED(4) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_char x, y, z, w; }; __extension__ struct{ cl_char s0, s1, s2, s3; }; __extension__ struct{ cl_char2 lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2[2]; #endif #if defined( __CL_CHAR4__) __cl_char4 v4; #endif }cl_char4; /* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */ typedef cl_char4 cl_char3; typedef union { cl_char CL_ALIGNED(8) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_char x, y, z, w; }; __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_char4 lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2[4]; #endif #if defined( __CL_CHAR4__) __cl_char4 v4[2]; #endif #if defined( __CL_CHAR8__ ) __cl_char8 v8; #endif }cl_char8; typedef union { cl_char CL_ALIGNED(16) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_char8 lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2[8]; #endif #if defined( __CL_CHAR4__) __cl_char4 v4[4]; #endif #if defined( __CL_CHAR8__ ) __cl_char8 v8[2]; #endif #if defined( __CL_CHAR16__ ) __cl_char16 v16; #endif }cl_char16; /* ---- cl_ucharn ---- */ typedef union { cl_uchar CL_ALIGNED(2) s[2]; #if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uchar x, y; }; __extension__ struct{ cl_uchar s0, s1; }; __extension__ struct{ cl_uchar lo, hi; }; #endif #if defined( __cl_uchar2__) __cl_uchar2 v2; #endif }cl_uchar2; typedef union { cl_uchar CL_ALIGNED(4) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uchar x, y, z, w; }; __extension__ struct{ cl_uchar s0, s1, s2, s3; }; __extension__ struct{ cl_uchar2 lo, hi; }; #endif #if defined( __CL_UCHAR2__) __cl_uchar2 v2[2]; #endif #if defined( __CL_UCHAR4__) __cl_uchar4 v4; #endif }cl_uchar4; /* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */ typedef cl_uchar4 cl_uchar3; typedef union { cl_uchar CL_ALIGNED(8) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uchar x, y, z, w; }; __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_uchar4 lo, hi; }; #endif #if defined( __CL_UCHAR2__) __cl_uchar2 v2[4]; #endif #if defined( __CL_UCHAR4__) __cl_uchar4 v4[2]; #endif #if defined( __CL_UCHAR8__ ) __cl_uchar8 v8; #endif }cl_uchar8; typedef union { cl_uchar CL_ALIGNED(16) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_uchar8 lo, hi; }; #endif #if defined( __CL_UCHAR2__) __cl_uchar2 v2[8]; #endif #if defined( __CL_UCHAR4__) __cl_uchar4 v4[4]; #endif #if defined( __CL_UCHAR8__ ) __cl_uchar8 v8[2]; #endif #if defined( __CL_UCHAR16__ ) __cl_uchar16 v16; #endif }cl_uchar16; /* ---- cl_shortn ---- */ typedef union { cl_short CL_ALIGNED(4) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_short x, y; }; __extension__ struct{ cl_short s0, s1; }; __extension__ struct{ cl_short lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2; #endif }cl_short2; typedef union { cl_short CL_ALIGNED(8) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_short x, y, z, w; }; __extension__ struct{ cl_short s0, s1, s2, s3; }; __extension__ struct{ cl_short2 lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2[2]; #endif #if defined( __CL_SHORT4__) __cl_short4 v4; #endif }cl_short4; /* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */ typedef cl_short4 cl_short3; typedef union { cl_short CL_ALIGNED(16) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_short x, y, z, w; }; __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_short4 lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2[4]; #endif #if defined( __CL_SHORT4__) __cl_short4 v4[2]; #endif #if defined( __CL_SHORT8__ ) __cl_short8 v8; #endif }cl_short8; typedef union { cl_short CL_ALIGNED(32) s[16]; #if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) __extension__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_short8 lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2[8]; #endif #if defined( __CL_SHORT4__) __cl_short4 v4[4]; #endif #if defined( __CL_SHORT8__ ) __cl_short8 v8[2]; #endif #if defined( __CL_SHORT16__ ) __cl_short16 v16; #endif }cl_short16; /* ---- cl_ushortn ---- */ typedef union { cl_ushort CL_ALIGNED(4) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ushort x, y; }; __extension__ struct{ cl_ushort s0, s1; }; __extension__ struct{ cl_ushort lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2; #endif }cl_ushort2; typedef union { cl_ushort CL_ALIGNED(8) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ushort x, y, z, w; }; __extension__ struct{ cl_ushort s0, s1, s2, s3; }; __extension__ struct{ cl_ushort2 lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2[2]; #endif #if defined( __CL_USHORT4__) __cl_ushort4 v4; #endif }cl_ushort4; /* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */ typedef cl_ushort4 cl_ushort3; typedef union { cl_ushort CL_ALIGNED(16) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ushort x, y, z, w; }; __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_ushort4 lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2[4]; #endif #if defined( __CL_USHORT4__) __cl_ushort4 v4[2]; #endif #if defined( __CL_USHORT8__ ) __cl_ushort8 v8; #endif }cl_ushort8; typedef union { cl_ushort CL_ALIGNED(32) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_ushort8 lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2[8]; #endif #if defined( __CL_USHORT4__) __cl_ushort4 v4[4]; #endif #if defined( __CL_USHORT8__ ) __cl_ushort8 v8[2]; #endif #if defined( __CL_USHORT16__ ) __cl_ushort16 v16; #endif }cl_ushort16; /* ---- cl_intn ---- */ typedef union { cl_int CL_ALIGNED(8) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_int x, y; }; __extension__ struct{ cl_int s0, s1; }; __extension__ struct{ cl_int lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2; #endif }cl_int2; typedef union { cl_int CL_ALIGNED(16) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_int x, y, z, w; }; __extension__ struct{ cl_int s0, s1, s2, s3; }; __extension__ struct{ cl_int2 lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2[2]; #endif #if defined( __CL_INT4__) __cl_int4 v4; #endif }cl_int4; /* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */ typedef cl_int4 cl_int3; typedef union { cl_int CL_ALIGNED(32) s[8]; #if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) __extension__ struct{ cl_int x, y, z, w; }; __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_int4 lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2[4]; #endif #if defined( __CL_INT4__) __cl_int4 v4[2]; #endif #if defined( __CL_INT8__ ) __cl_int8 v8; #endif }cl_int8; typedef union { cl_int CL_ALIGNED(64) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_int8 lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2[8]; #endif #if defined( __CL_INT4__) __cl_int4 v4[4]; #endif #if defined( __CL_INT8__ ) __cl_int8 v8[2]; #endif #if defined( __CL_INT16__ ) __cl_int16 v16; #endif }cl_int16; /* ---- cl_uintn ---- */ typedef union { cl_uint CL_ALIGNED(8) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uint x, y; }; __extension__ struct{ cl_uint s0, s1; }; __extension__ struct{ cl_uint lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2; #endif }cl_uint2; typedef union { cl_uint CL_ALIGNED(16) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uint x, y, z, w; }; __extension__ struct{ cl_uint s0, s1, s2, s3; }; __extension__ struct{ cl_uint2 lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2[2]; #endif #if defined( __CL_UINT4__) __cl_uint4 v4; #endif }cl_uint4; /* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */ typedef cl_uint4 cl_uint3; typedef union { cl_uint CL_ALIGNED(32) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uint x, y, z, w; }; __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_uint4 lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2[4]; #endif #if defined( __CL_UINT4__) __cl_uint4 v4[2]; #endif #if defined( __CL_UINT8__ ) __cl_uint8 v8; #endif }cl_uint8; typedef union { cl_uint CL_ALIGNED(64) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_uint8 lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2[8]; #endif #if defined( __CL_UINT4__) __cl_uint4 v4[4]; #endif #if defined( __CL_UINT8__ ) __cl_uint8 v8[2]; #endif #if defined( __CL_UINT16__ ) __cl_uint16 v16; #endif }cl_uint16; /* ---- cl_longn ---- */ typedef union { cl_long CL_ALIGNED(16) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_long x, y; }; __extension__ struct{ cl_long s0, s1; }; __extension__ struct{ cl_long lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2; #endif }cl_long2; typedef union { cl_long CL_ALIGNED(32) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_long x, y, z, w; }; __extension__ struct{ cl_long s0, s1, s2, s3; }; __extension__ struct{ cl_long2 lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2[2]; #endif #if defined( __CL_LONG4__) __cl_long4 v4; #endif }cl_long4; /* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ typedef cl_long4 cl_long3; typedef union { cl_long CL_ALIGNED(64) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_long x, y, z, w; }; __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_long4 lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2[4]; #endif #if defined( __CL_LONG4__) __cl_long4 v4[2]; #endif #if defined( __CL_LONG8__ ) __cl_long8 v8; #endif }cl_long8; typedef union { cl_long CL_ALIGNED(128) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_long8 lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2[8]; #endif #if defined( __CL_LONG4__) __cl_long4 v4[4]; #endif #if defined( __CL_LONG8__ ) __cl_long8 v8[2]; #endif #if defined( __CL_LONG16__ ) __cl_long16 v16; #endif }cl_long16; /* ---- cl_ulongn ---- */ typedef union { cl_ulong CL_ALIGNED(16) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ulong x, y; }; __extension__ struct{ cl_ulong s0, s1; }; __extension__ struct{ cl_ulong lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2; #endif }cl_ulong2; typedef union { cl_ulong CL_ALIGNED(32) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ulong x, y, z, w; }; __extension__ struct{ cl_ulong s0, s1, s2, s3; }; __extension__ struct{ cl_ulong2 lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2[2]; #endif #if defined( __CL_ULONG4__) __cl_ulong4 v4; #endif }cl_ulong4; /* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */ typedef cl_ulong4 cl_ulong3; typedef union { cl_ulong CL_ALIGNED(64) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ulong x, y, z, w; }; __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_ulong4 lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2[4]; #endif #if defined( __CL_ULONG4__) __cl_ulong4 v4[2]; #endif #if defined( __CL_ULONG8__ ) __cl_ulong8 v8; #endif }cl_ulong8; typedef union { cl_ulong CL_ALIGNED(128) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_ulong8 lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2[8]; #endif #if defined( __CL_ULONG4__) __cl_ulong4 v4[4]; #endif #if defined( __CL_ULONG8__ ) __cl_ulong8 v8[2]; #endif #if defined( __CL_ULONG16__ ) __cl_ulong16 v16; #endif }cl_ulong16; /* --- cl_floatn ---- */ typedef union { cl_float CL_ALIGNED(8) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_float x, y; }; __extension__ struct{ cl_float s0, s1; }; __extension__ struct{ cl_float lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2; #endif }cl_float2; typedef union { cl_float CL_ALIGNED(16) s[4]; #if defined( __GNUC__) && ! 
defined( __STRICT_ANSI__ ) __extension__ struct{ cl_float x, y, z, w; }; __extension__ struct{ cl_float s0, s1, s2, s3; }; __extension__ struct{ cl_float2 lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2[2]; #endif #if defined( __CL_FLOAT4__) __cl_float4 v4; #endif }cl_float4; /* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */ typedef cl_float4 cl_float3; typedef union { cl_float CL_ALIGNED(32) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_float x, y, z, w; }; __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_float4 lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2[4]; #endif #if defined( __CL_FLOAT4__) __cl_float4 v4[2]; #endif #if defined( __CL_FLOAT8__ ) __cl_float8 v8; #endif }cl_float8; typedef union { cl_float CL_ALIGNED(64) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_float8 lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2[8]; #endif #if defined( __CL_FLOAT4__) __cl_float4 v4[4]; #endif #if defined( __CL_FLOAT8__ ) __cl_float8 v8[2]; #endif #if defined( __CL_FLOAT16__ ) __cl_float16 v16; #endif }cl_float16; /* --- cl_doublen ---- */ typedef union { cl_double CL_ALIGNED(16) s[2]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_double x, y; }; __extension__ struct{ cl_double s0, s1; }; __extension__ struct{ cl_double lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2; #endif }cl_double2; typedef union { cl_double CL_ALIGNED(32) s[4]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_double x, y, z, w; }; __extension__ struct{ cl_double s0, s1, s2, s3; }; __extension__ struct{ cl_double2 lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2[2]; #endif #if defined( __CL_DOUBLE4__) __cl_double4 v4; #endif }cl_double4; /* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ typedef cl_double4 cl_double3; typedef union { cl_double CL_ALIGNED(64) s[8]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_double x, y, z, w; }; __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; __extension__ struct{ cl_double4 lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2[4]; #endif #if defined( __CL_DOUBLE4__) __cl_double4 v4[2]; #endif #if defined( __CL_DOUBLE8__ ) __cl_double8 v8; #endif }cl_double8; typedef union { cl_double CL_ALIGNED(128) s[16]; #if defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) __extension__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __extension__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __extension__ struct{ cl_double8 lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2[8]; #endif #if defined( __CL_DOUBLE4__) __cl_double4 v4[4]; #endif #if defined( __CL_DOUBLE8__ ) __cl_double8 v8[2]; #endif #if defined( __CL_DOUBLE16__ ) __cl_double16 v16; #endif }cl_double16; /* Macro to facilitate debugging * Usage: * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
* The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" * Each line thereafter of OpenCL C source must end with: \n\ * The last line ends in "; * * Example: * * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ * kernel void foo( int a, float * b ) \n\ * { \n\ * // my comment \n\ * *b[ get_global_id(0)] = a; \n\ * } \n\ * "; * * This should correctly set up the line, (column) and file information for your source * string so you can do source level debugging. */ #define __CL_STRINGIFY( _x ) # _x #define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) #define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" #ifdef __cplusplus } #endif #endif /* __CL_PLATFORM_H */ x264-master/extras/getopt.c000066400000000000000000000733451502133446700160470ustar00rootroot00000000000000/* Getopt for GNU. NOTE: getopt is now part of the C library, so if you don't know what "Keep this file name-space clean" means, talk to drepper@gnu.org before changing it! Copyright (C) 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. */ /* This tells Alpha OSF/1 not to define a getopt prototype in . Ditto for AIX 3.2 and . */ #ifndef _NO_PROTO # define _NO_PROTO #endif #ifdef HAVE_CONFIG_H # include #endif #if !defined __STDC__ || !__STDC__ /* This is a separate conditional since some stdc systems reject `defined (const)'. */ # ifndef const # define const # endif #endif #include /* Comment out all this code if we are using the GNU C Library, and are not actually compiling the library itself. This code is part of the GNU C Library, but also included in many other GNU distributions. Compiling and linking in this code is a waste when using the GNU C library (especially if it is a shared library). Rather than having every GNU program understand `configure --with-gnu-libc' and omit the object files, it is simpler to just do this in the source for each such file. */ #define GETOPT_INTERFACE_VERSION 2 #if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 # include # if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION # define ELIDE_CODE # endif #endif #ifndef ELIDE_CODE /* This needs to come after some library #include to get __GNU_LIBRARY__ defined. */ #ifdef __GNU_LIBRARY__ /* Don't include stdlib.h for non-GNU C libraries because some of them contain conflicting prototypes for getopt. */ # include # include #endif /* GNU C library. */ #ifdef VMS # include # if HAVE_STRING_H - 0 # include # endif #endif #ifndef _ /* This is for other GNU distributions with internationalized messages. 
*/ # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC # include # ifndef _ # define _(msgid) gettext (msgid) # endif # else # define _(msgid) (msgid) # endif #endif /* This version of `getopt' appears to the caller like standard Unix `getopt' but it behaves differently for the user, since it allows the user to intersperse the options with the other arguments. As `getopt' works, it permutes the elements of ARGV so that, when it is done, all the options precede everything else. Thus all application programs are extended to handle flexible argument order. Setting the environment variable POSIXLY_CORRECT disables permutation. Then the behavior is completely standard. GNU application programs can use a third alternative mode in which they can distinguish the relative order of options and other arguments. */ #include "getopt.h" /* For communication from `getopt' to the caller. When `getopt' finds an option that takes an argument, the argument value is returned here. Also, when `ordering' is RETURN_IN_ORDER, each non-option ARGV-element is returned here. */ char *optarg; /* Index in ARGV of the next element to be scanned. This is used for communication to and from the caller and for communication between successive calls to `getopt'. On entry to `getopt', zero means this is the first call; initialize. When `getopt' returns -1, this is the index of the first of the non-option elements that the caller should itself scan. Otherwise, `optind' communicates from one call to the next how much of ARGV has been scanned so far. */ /* 1003.2 says this must be 1 before any call. */ int optind = 1; /* Formerly, initialization of getopt depended on optind==0, which causes problems with re-calling getopt as programs generally don't know that. */ int __getopt_initialized; /* The next char to be scanned in the option-element in which the last option character we returned was found. This allows us to pick up the scan where we left off. If this is zero, or a null string, it means resume the scan by advancing to the next ARGV-element. */ static char *nextchar; /* Callers store zero here to inhibit the error message for unrecognized options. */ int opterr = 1; /* Set to an option character which was unrecognized. This must be initialized on some systems to avoid linking in the system's own getopt implementation. */ int optopt = '?'; /* Describe how to deal with options that follow non-option ARGV-elements. If the caller did not specify anything, the default is REQUIRE_ORDER if the environment variable POSIXLY_CORRECT is defined, PERMUTE otherwise. REQUIRE_ORDER means don't recognize them as options; stop option processing when the first non-option is seen. This is what Unix does. This mode of operation is selected by either setting the environment variable POSIXLY_CORRECT, or using `+' as the first character of the list of option characters. PERMUTE is the default. We permute the contents of ARGV as we scan, so that eventually all the non-options are at the end. This allows options to be given in any order, even with programs that were not written to expect this. RETURN_IN_ORDER is an option available to programs that were written to expect options and other ARGV-elements in any order and that care about the ordering of the two. We describe each non-option ARGV-element as if it were the argument of an option with character code 1. Using `-' as the first character of the list of option characters selects this mode of operation. 
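(Added illustration, not part of the original comment: with argv == { "prog", "afile", "-x", "bfile" } and option character 'x', PERMUTE eventually returns 'x' and then -1, leaving argv permuted to { "prog", "-x", "afile", "bfile" } with `optind' == 2; REQUIRE_ORDER returns -1 on the first call with `optind' == 1, having stopped at the first non-option; RETURN_IN_ORDER returns 1 with `optarg' == "afile", then 'x', then 1 with `optarg' == "bfile", then -1.)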
The special argument `--' forces an end of option-scanning regardless of the value of `ordering'. In the case of RETURN_IN_ORDER, only `--' can cause `getopt' to return -1 with `optind' != ARGC. */ static enum { REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER } ordering; /* Value of POSIXLY_CORRECT environment variable. */ static char *posixly_correct; #ifdef __GNU_LIBRARY__ /* We want to avoid inclusion of string.h with non-GNU libraries because there are many ways it can cause trouble. On some systems, it contains special magic macros that don't work in GCC. */ # include # define my_index strchr #else # if HAVE_STRING_H # include # else # include # endif /* Avoid depending on library functions or files whose names are inconsistent. */ #ifndef getenv extern char *getenv (); #endif static char * my_index (str, chr) const char *str; int chr; { while (*str) { if (*str == chr) return (char *) str; str++; } return 0; } /* If using GCC, we can safely declare strlen this way. If not using GCC, it is ok not to declare it. */ #ifdef __GNUC__ /* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. That was relevant to code that was here before. */ # if (!defined __STDC__ || !__STDC__) && !defined strlen /* gcc with -traditional declares the built-in strlen to return int, and has done so at least since version 2.4.5. -- rms. */ extern int strlen (const char *); # endif /* not __STDC__ */ #endif /* __GNUC__ */ #endif /* not __GNU_LIBRARY__ */ /* Handle permutation of arguments. */ /* Describe the part of ARGV that contains non-options that have been skipped. `first_nonopt' is the index in ARGV of the first of them; `last_nonopt' is the index after the last of them. */ static int first_nonopt; static int last_nonopt; #ifdef _LIBC /* Stored original parameters. XXX This is no good solution. We should rather copy the args so that we can compare them later. But we must not use malloc(3). */ extern int __libc_argc; extern char **__libc_argv; /* Bash 2.0 gives us an environment variable containing flags indicating ARGV elements that should not be considered arguments. */ # ifdef USE_NONOPTION_FLAGS /* Defined in getopt_init.c */ extern char *__getopt_nonoption_flags; static int nonoption_flags_max_len; static int nonoption_flags_len; # endif # ifdef USE_NONOPTION_FLAGS # define SWAP_FLAGS(ch1, ch2) \ if (nonoption_flags_len > 0) \ { \ char __tmp = __getopt_nonoption_flags[ch1]; \ __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ __getopt_nonoption_flags[ch2] = __tmp; \ } # else # define SWAP_FLAGS(ch1, ch2) # endif #else /* !_LIBC */ # define SWAP_FLAGS(ch1, ch2) #endif /* _LIBC */ /* Exchange two adjacent subsequences of ARGV. One subsequence is elements [first_nonopt,last_nonopt) which contains all the non-options that have been skipped so far. The other is elements [last_nonopt,optind), which contains all the options processed since those non-options were skipped. `first_nonopt' and `last_nonopt' are relocated so that they describe the new indices of the non-options in ARGV after they are moved. */ #if defined __STDC__ && __STDC__ static void exchange (char **); #endif static void exchange (argv) char **argv; { int bottom = first_nonopt; int middle = last_nonopt; int top = optind; char *tem; /* Exchange the shorter segment with the far end of the longer segment. That puts the shorter segment into the right place. It leaves the longer segment in the right place overall, but it consists of two parts that need to be swapped next. 
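(Added illustration: suppose argv == { "prog", "a", "b", "-x", "-y" } with first_nonopt == 1, last_nonopt == 3 and optind == 5. The two segments have equal length here, so a single round of swaps leaves argv as { "prog", "-x", "-y", "a", "b" }, after which first_nonopt becomes 3 and last_nonopt becomes 5, i.e. the skipped non-options once again sit directly below optind.)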
*/ #if defined _LIBC && defined USE_NONOPTION_FLAGS /* First make sure the handling of the `__getopt_nonoption_flags' string can work normally. Our top argument must be in the range of the string. */ if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) { /* We must extend the array. The user plays games with us and presents new arguments. */ char *new_str = malloc (top + 1); if (new_str == NULL) nonoption_flags_len = nonoption_flags_max_len = 0; else { memset (__mempcpy (new_str, __getopt_nonoption_flags, nonoption_flags_max_len), '\0', top + 1 - nonoption_flags_max_len); nonoption_flags_max_len = top + 1; __getopt_nonoption_flags = new_str; } } #endif while (top > middle && middle > bottom) { if (top - middle > middle - bottom) { /* Bottom segment is the short one. */ int len = middle - bottom; register int i; /* Swap it with the top part of the top segment. */ for (i = 0; i < len; i++) { tem = argv[bottom + i]; argv[bottom + i] = argv[top - (middle - bottom) + i]; argv[top - (middle - bottom) + i] = tem; SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); } /* Exclude the moved bottom segment from further swapping. */ top -= len; } else { /* Top segment is the short one. */ int len = top - middle; register int i; /* Swap it with the bottom part of the bottom segment. */ for (i = 0; i < len; i++) { tem = argv[bottom + i]; argv[bottom + i] = argv[middle + i]; argv[middle + i] = tem; SWAP_FLAGS (bottom + i, middle + i); } /* Exclude the moved top segment from further swapping. */ bottom += len; } } /* Update records for the slots the non-options now occupy. */ first_nonopt += (optind - last_nonopt); last_nonopt = optind; } /* Initialize the internal data when the first call is made. */ #if defined __STDC__ && __STDC__ static const char *_getopt_initialize (int, char *const *, const char *); #endif static const char * _getopt_initialize (argc, argv, optstring) int argc; char *const *argv; const char *optstring; { /* Start processing options with ARGV-element 1 (since ARGV-element 0 is the program name); the sequence of previously skipped non-option ARGV-elements is empty. */ first_nonopt = last_nonopt = optind; nextchar = NULL; posixly_correct = getenv ("POSIXLY_CORRECT"); /* Determine how to handle the ordering of options and nonoptions. */ if (optstring[0] == '-') { ordering = RETURN_IN_ORDER; ++optstring; } else if (optstring[0] == '+') { ordering = REQUIRE_ORDER; ++optstring; } else if (posixly_correct != NULL) ordering = REQUIRE_ORDER; else ordering = PERMUTE; #if defined _LIBC && defined USE_NONOPTION_FLAGS if (posixly_correct == NULL && argc == __libc_argc && argv == __libc_argv) { if (nonoption_flags_max_len == 0) { if (__getopt_nonoption_flags == NULL || __getopt_nonoption_flags[0] == '\0') nonoption_flags_max_len = -1; else { const char *orig_str = __getopt_nonoption_flags; int len = nonoption_flags_max_len = strlen (orig_str); if (nonoption_flags_max_len < argc) nonoption_flags_max_len = argc; __getopt_nonoption_flags = (char *) malloc (nonoption_flags_max_len); if (__getopt_nonoption_flags == NULL) nonoption_flags_max_len = -1; else memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), '\0', nonoption_flags_max_len - len); } } nonoption_flags_len = nonoption_flags_max_len; } else nonoption_flags_len = 0; #endif return optstring; } /* Scan elements of ARGV (whose length is ARGC) for option characters given in OPTSTRING. If an element of ARGV starts with '-', and is not exactly "-" or "--", then it is an option element. 
The characters of this element (aside from the initial '-') are option characters. If `getopt' is called repeatedly, it returns successively each of the option characters from each of the option elements. If `getopt' finds another option character, it returns that character, updating `optind' and `nextchar' so that the next call to `getopt' can resume the scan with the following option character or ARGV-element. If there are no more option characters, `getopt' returns -1. Then `optind' is the index in ARGV of the first ARGV-element that is not an option. (The ARGV-elements have been permuted so that those that are not options now come last.) OPTSTRING is a string containing the legitimate option characters. If an option character is seen that is not listed in OPTSTRING, return '?' after printing an error message. If you set `opterr' to zero, the error message is suppressed but we still return '?'. If a char in OPTSTRING is followed by a colon, that means it wants an arg, so the following text in the same ARGV-element, or the text of the following ARGV-element, is returned in `optarg'. Two colons mean an option that wants an optional arg; if there is text in the current ARGV-element, it is returned in `optarg', otherwise `optarg' is set to zero. If OPTSTRING starts with `-' or `+', it requests different methods of handling the non-option ARGV-elements. See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. Long-named options begin with `--' instead of `-'. Their names may be abbreviated as long as the abbreviation is unique or is an exact match for some defined option. If they have an argument, it follows the option name in the same ARGV-element, separated from the option name by a `=', or else the in next ARGV-element. When `getopt' finds a long-named option, it returns 0 if that option's `flag' field is nonzero, the value of the option's `val' field if the `flag' field is zero. The elements of ARGV aren't really const, because we permute them. But we pretend they're const in the prototype to be compatible with other systems. LONGOPTS is a vector of `struct option' terminated by an element containing a name which is zero. LONGIND returns the index in LONGOPT of the long-named option found. It is only valid when a long-named option has been found by the most recent call. If LONG_ONLY is nonzero, '-' as well as '--' can introduce long-named options. */ int _getopt_internal (argc, argv, optstring, longopts, longind, long_only) int argc; char *const *argv; const char *optstring; const struct option *longopts; int *longind; int long_only; { int print_errors = opterr; if (optstring[0] == ':') print_errors = 0; if (argc < 1) return -1; optarg = NULL; if (optind == 0 || !__getopt_initialized) { if (optind == 0) optind = 1; /* Don't scan ARGV[0], the program name. */ optstring = _getopt_initialize (argc, argv, optstring); __getopt_initialized = 1; } /* Test whether ARGV[optind] points to a non-option argument. Either it does not have option syntax, or there is an environment flag from the shell indicating it is not an option. The later information is only used when the used in the GNU libc. */ #if defined _LIBC && defined USE_NONOPTION_FLAGS # define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ || (optind < nonoption_flags_len \ && __getopt_nonoption_flags[optind] == '1')) #else # define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') #endif if (nextchar == NULL || *nextchar == '\0') { /* Advance to the next ARGV-element. 
*/ /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been moved back by the user (who may also have changed the arguments). */ if (last_nonopt > optind) last_nonopt = optind; if (first_nonopt > optind) first_nonopt = optind; if (ordering == PERMUTE) { /* If we have just processed some options following some non-options, exchange them so that the options come first. */ if (first_nonopt != last_nonopt && last_nonopt != optind) exchange ((char **) argv); else if (last_nonopt != optind) first_nonopt = optind; /* Skip any additional non-options and extend the range of non-options previously skipped. */ while (optind < argc && NONOPTION_P) optind++; last_nonopt = optind; } /* The special ARGV-element `--' means premature end of options. Skip it like a null option, then exchange with previous non-options as if it were an option, then skip everything else like a non-option. */ if (optind != argc && !strcmp (argv[optind], "--")) { optind++; if (first_nonopt != last_nonopt && last_nonopt != optind) exchange ((char **) argv); else if (first_nonopt == last_nonopt) first_nonopt = optind; last_nonopt = argc; optind = argc; } /* If we have done all the ARGV-elements, stop the scan and back over any non-options that we skipped and permuted. */ if (optind == argc) { /* Set the next-arg-index to point at the non-options that we previously skipped, so the caller will digest them. */ if (first_nonopt != last_nonopt) optind = first_nonopt; return -1; } /* If we have come to a non-option and did not permute it, either stop the scan or describe it to the caller and pass it by. */ if (NONOPTION_P) { if (ordering == REQUIRE_ORDER) return -1; optarg = argv[optind++]; return 1; } /* We have found another option-ARGV-element. Skip the initial punctuation. */ nextchar = (argv[optind] + 1 + (longopts != NULL && argv[optind][1] == '-')); } /* Decode the current option-ARGV-element. */ /* Check whether the ARGV-element is a long option. If long_only and the ARGV-element has the form "-f", where f is a valid short option, don't consider it an abbreviated form of a long option that starts with f. Otherwise there would be no way to give the -f short option. On the other hand, if there's a long option "fubar" and the ARGV-element is "-fu", do consider that an abbreviation of the long option, just like "--fu", and not "-f" with arg "u". This distinction seems to be the most useful approach. */ if (longopts != NULL && (argv[optind][1] == '-' || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) { char *nameend; const struct option *p; const struct option *pfound = NULL; int exact = 0; int ambig = 0; int indfound = -1; int option_index; for (nameend = nextchar; *nameend && *nameend != '='; nameend++) /* Do nothing. */ ; /* Test all long options for either exact match or abbreviated matches. */ for (p = longopts, option_index = 0; p->name; p++, option_index++) if (!strncmp (p->name, nextchar, nameend - nextchar)) { if ((unsigned int) (nameend - nextchar) == (unsigned int) strlen (p->name)) { /* Exact match found. */ pfound = p; indfound = option_index; exact = 1; break; } else if (pfound == NULL) { /* First nonexact match found. */ pfound = p; indfound = option_index; } else if (long_only || pfound->has_arg != p->has_arg || pfound->flag != p->flag || pfound->val != p->val) /* Second or later nonexact match found. 
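(Added illustration: given hypothetical long options "list" and "listing" with different `val' fields, the abbreviation `--lis' prefix-matches both and is reported as ambiguous below, whereas `--list' is accepted because an exact match always wins over prefix matches.)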
*/ ambig = 1; } if (ambig && !exact) { if (print_errors) fprintf (stderr, _("%s: option `%s' is ambiguous\n"), argv[0], argv[optind]); nextchar += strlen (nextchar); optind++; optopt = 0; return '?'; } if (pfound != NULL) { option_index = indfound; optind++; if (*nameend) { /* Don't test has_arg with >, because some C compilers don't allow it to be used on enums. */ if (pfound->has_arg) optarg = nameend + 1; else { if (print_errors) { if (argv[optind - 1][1] == '-') /* --option */ fprintf (stderr, _("%s: option `--%s' doesn't allow an argument\n"), argv[0], pfound->name); else /* +option or -option */ fprintf (stderr, _("%s: option `%c%s' doesn't allow an argument\n"), argv[0], argv[optind - 1][0], pfound->name); } nextchar += strlen (nextchar); optopt = pfound->val; return '?'; } } else if (pfound->has_arg == 1) { if (optind < argc) optarg = argv[optind++]; else { if (print_errors) fprintf (stderr, _("%s: option `%s' requires an argument\n"), argv[0], argv[optind - 1]); nextchar += strlen (nextchar); optopt = pfound->val; return optstring[0] == ':' ? ':' : '?'; } } nextchar += strlen (nextchar); if (longind != NULL) *longind = option_index; if (pfound->flag) { *(pfound->flag) = pfound->val; return 0; } return pfound->val; } /* Can't find it as a long option. If this is not getopt_long_only, or the option starts with '--' or is not a valid short option, then it's an error. Otherwise interpret it as a short option. */ if (!long_only || argv[optind][1] == '-' || my_index (optstring, *nextchar) == NULL) { if (print_errors) { if (argv[optind][1] == '-') /* --option */ fprintf (stderr, _("%s: unrecognized option `--%s'\n"), argv[0], nextchar); else /* +option or -option */ fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), argv[0], argv[optind][0], nextchar); } nextchar = (char *) ""; optind++; optopt = 0; return '?'; } } /* Look at and handle the next short option-character. */ { char c = *nextchar++; char *temp = my_index (optstring, c); /* Increment `optind' when we start to process its last character. */ if (*nextchar == '\0') ++optind; if (temp == NULL || c == ':') { if (print_errors) { if (posixly_correct) /* 1003.2 specifies the format of this message. */ fprintf (stderr, _("%s: illegal option -- %c\n"), argv[0], c); else fprintf (stderr, _("%s: invalid option -- %c\n"), argv[0], c); } optopt = c; return '?'; } /* Convenience. Treat POSIX -W foo same as long option --foo */ if (temp[0] == 'W' && temp[1] == ';') { char *nameend; const struct option *p; const struct option *pfound = NULL; int exact = 0; int ambig = 0; int indfound = 0; int option_index; /* This is an option that requires an argument. */ if (*nextchar != '\0') { optarg = nextchar; /* If we end this ARGV-element by taking the rest as an arg, we must advance to the next element now. */ optind++; } else if (optind == argc) { if (print_errors) { /* 1003.2 specifies the format of this message. */ fprintf (stderr, _("%s: option requires an argument -- %c\n"), argv[0], c); } optopt = c; if (optstring[0] == ':') c = ':'; else c = '?'; return c; } else /* We already incremented `optind' once; increment it again when taking next ARGV-elt as argument. */ optarg = argv[optind++]; /* optarg is now the argument, see if it's in the table of longopts. */ for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) /* Do nothing. */ ; /* Test all long options for either exact match or abbreviated matches. 
*/ for (p = longopts, option_index = 0; p->name; p++, option_index++) if (!strncmp (p->name, nextchar, nameend - nextchar)) { if ((unsigned int) (nameend - nextchar) == strlen (p->name)) { /* Exact match found. */ pfound = p; indfound = option_index; exact = 1; break; } else if (pfound == NULL) { /* First nonexact match found. */ pfound = p; indfound = option_index; } else /* Second or later nonexact match found. */ ambig = 1; } if (ambig && !exact) { if (print_errors) fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), argv[0], argv[optind]); nextchar += strlen (nextchar); optind++; return '?'; } if (pfound != NULL) { option_index = indfound; if (*nameend) { /* Don't test has_arg with >, because some C compilers don't allow it to be used on enums. */ if (pfound->has_arg) optarg = nameend + 1; else { if (print_errors) fprintf (stderr, _("\ %s: option `-W %s' doesn't allow an argument\n"), argv[0], pfound->name); nextchar += strlen (nextchar); return '?'; } } else if (pfound->has_arg == 1) { if (optind < argc) optarg = argv[optind++]; else { if (print_errors) fprintf (stderr, _("%s: option `%s' requires an argument\n"), argv[0], argv[optind - 1]); nextchar += strlen (nextchar); return optstring[0] == ':' ? ':' : '?'; } } nextchar += strlen (nextchar); if (longind != NULL) *longind = option_index; if (pfound->flag) { *(pfound->flag) = pfound->val; return 0; } return pfound->val; } nextchar = NULL; return 'W'; /* Let the application handle it. */ } if (temp[1] == ':') { if (temp[2] == ':') { /* This is an option that accepts an argument optionally. */ if (*nextchar != '\0') { optarg = nextchar; optind++; } else optarg = NULL; nextchar = NULL; } else { /* This is an option that requires an argument. */ if (*nextchar != '\0') { optarg = nextchar; /* If we end this ARGV-element by taking the rest as an arg, we must advance to the next element now. */ optind++; } else if (optind == argc) { if (print_errors) { /* 1003.2 specifies the format of this message. */ fprintf (stderr, _("%s: option requires an argument -- %c\n"), argv[0], c); } optopt = c; if (optstring[0] == ':') c = ':'; else c = '?'; } else /* We already incremented `optind' once; increment it again when taking next ARGV-elt as argument. */ optarg = argv[optind++]; nextchar = NULL; } } return c; } } int getopt (argc, argv, optstring) int argc; char *const *argv; const char *optstring; { return _getopt_internal (argc, argv, optstring, (const struct option *) 0, (int *) 0, 0); } int getopt_long (argc, argv, optstring, long_options, opt_index) int argc; char *const *argv; const char *optstring; const struct option *long_options; int *opt_index; { return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0); } #endif /* Not ELIDE_CODE. */ #ifdef TEST /* Compile with -DTEST to make an executable for use in testing the above definition of `getopt'. */ int main (argc, argv) int argc; char **argv; { int c; int digit_optind = 0; while (1) { int this_option_optind = optind ? optind : 1; c = getopt (argc, argv, "abc:d:0123456789"); if (c == -1) break; switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (digit_optind != 0 && digit_optind != this_option_optind) printf ("digits occur in two different argv-elements.\n"); digit_optind = this_option_optind; printf ("option %c\n", c); break; case 'a': printf ("option a\n"); break; case 'b': printf ("option b\n"); break; case 'c': printf ("option c with value `%s'\n", optarg); break; case '?': break; default: printf ("?? 
getopt returned character code 0%o ??\n", c); } } if (optind < argc) { printf ("non-option ARGV-elements: "); while (optind < argc) printf ("%s ", argv[optind++]); printf ("\n"); } exit (0); } #endif /* TEST */ x264-master/extras/getopt.h000066400000000000000000000143751502133446700160520ustar00rootroot00000000000000/* Declarations for getopt. Copyright (C) 1989-1994, 1996-1999, 2001 Free Software Foundation, Inc. This file is part of the GNU C Library. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA. */ #ifndef _GETOPT_H #ifndef __need_getopt # define _GETOPT_H 1 #endif /* If __GNU_LIBRARY__ is not already defined, either we are being used standalone, or this is the first header included in the source file. If we are being used with glibc, we need to include , but that does not exist if we are standalone. So: if __GNU_LIBRARY__ is not defined, include , which will pull in for us if it's from glibc. (Why ctype.h? It's guaranteed to exist and it doesn't flood the namespace with stuff the way some other headers do.) */ #if !defined __GNU_LIBRARY__ # include #endif #ifdef __cplusplus extern "C" { #endif /* For communication from `getopt' to the caller. When `getopt' finds an option that takes an argument, the argument value is returned here. Also, when `ordering' is RETURN_IN_ORDER, each non-option ARGV-element is returned here. */ extern char *optarg; /* Index in ARGV of the next element to be scanned. This is used for communication to and from the caller and for communication between successive calls to `getopt'. On entry to `getopt', zero means this is the first call; initialize. When `getopt' returns -1, this is the index of the first of the non-option elements that the caller should itself scan. Otherwise, `optind' communicates from one call to the next how much of ARGV has been scanned so far. */ extern int optind; /* Callers store zero here to inhibit the error message `getopt' prints for unrecognized options. */ extern int opterr; /* Set to an option character which was unrecognized. */ extern int optopt; #ifndef __need_getopt /* Describe the long-named options requested by the application. The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector of `struct option' terminated by an element containing a name which is zero. The field `has_arg' is: no_argument (or 0) if the option does not take an argument, required_argument (or 1) if the option requires an argument, optional_argument (or 2) if the option takes an optional argument. If the field `flag' is not NULL, it points to a variable that is set to the value given in the field `val' when the option is found, but left unchanged if the option is not found. 
To have a long-named option do something other than set an `int' to a compiled-in constant, such as set a value from `optarg', set the option's `flag' field to zero and its `val' field to a nonzero value (the equivalent single-letter option character, if there is one). For long options that have a zero `flag' field, `getopt' returns the contents of the `val' field. */ struct option { # if (defined __STDC__ && __STDC__) || defined __cplusplus const char *name; # else char *name; # endif /* has_arg can't be an enum because some compilers complain about type mismatches in all the code that assumes it is an int. */ int has_arg; int *flag; int val; }; /* Names for the values of the `has_arg' field of `struct option'. */ # define no_argument 0 # define required_argument 1 # define optional_argument 2 #endif /* need getopt */ /* Get definitions and prototypes for functions to process the arguments in ARGV (ARGC of them, minus the program name) for options given in OPTS. Return the option character from OPTS just read. Return -1 when there are no more options. For unrecognized options, or options missing arguments, `optopt' is set to the option letter, and '?' is returned. The OPTS string is a list of characters which are recognized option letters, optionally followed by colons, specifying that that letter takes an argument, to be placed in `optarg'. If a letter in OPTS is followed by two colons, its argument is optional. This behavior is specific to the GNU `getopt'. The argument `--' causes premature termination of argument scanning, explicitly telling `getopt' that there are no more options. If OPTS begins with `--', then non-option arguments are treated as arguments to the option '\0'. This behavior is specific to the GNU `getopt'. */ #if (defined __STDC__ && __STDC__) || defined __cplusplus # ifdef __GNU_LIBRARY__ /* Many other libraries have conflicting prototypes for getopt, with differences in the consts, in stdlib.h. To avoid compilation errors, only prototype getopt for the GNU C library. */ extern int getopt (int __argc, char *const *__argv, const char *__shortopts); # else /* not __GNU_LIBRARY__ */ extern int getopt (); # endif /* __GNU_LIBRARY__ */ # ifndef __need_getopt extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int *__longind); extern int getopt_long_only (int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int *__longind); /* Internal only. Users should not call this directly. */ extern int _getopt_internal (int __argc, char *const *__argv, const char *__shortopts, const struct option *__longopts, int *__longind, int __long_only); # endif #else /* not __STDC__ */ extern int getopt (); # ifndef __need_getopt extern int getopt_long (); extern int getopt_long_only (); extern int _getopt_internal (); # endif #endif /* __STDC__ */ #ifdef __cplusplus } #endif /* Make sure we later can get all the definitions and declarations. 
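As an added, purely illustrative sketch of how these declarations are typically used together — the option names and the demo_* identifiers below are hypothetical and not taken from x264 itself:

#include <stdio.h>
#include "getopt.h"

/* Hypothetical option table; `verbose_flag' uses the `flag' mechanism described
 * above, so getopt_long() returns 0 for it and stores 1 into the variable. */
static int verbose_flag;

static const struct option demo_options[] =
{
    { "output",  required_argument, NULL,          'o' },
    { "verbose", no_argument,       &verbose_flag,  1  },
    { NULL, 0, NULL, 0 }
};

static int demo_parse( int argc, char **argv )
{
    int c;
    /* "o:" declares the matching short option; the colon means it takes an argument. */
    while( (c = getopt_long( argc, argv, "o:", demo_options, NULL )) != -1 )
    {
        switch( c )
        {
            case 0:   break;                            /* flag-style long option already handled */
            case 'o': printf( "output: %s\n", optarg ); break;
            default:  return -1;                        /* '?': error already printed unless opterr == 0 */
        }
    }
    return optind; /* index of the first non-option ARGV-element */
}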
*/ #undef __need_getopt #endif /* getopt.h */ x264-master/extras/intel_dispatcher.h000066400000000000000000000035501502133446700200620ustar00rootroot00000000000000/***************************************************************************** * intel_dispatcher.h: intel compiler cpu dispatcher override ***************************************************************************** * Copyright (C) 2014-2025 x264 project * * Authors: Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_INTEL_DISPATCHER_H #define X264_INTEL_DISPATCHER_H /* Feature flags using _FEATURE_* defines from immintrin.h */ extern unsigned long long __intel_cpu_feature_indicator; extern unsigned long long __intel_cpu_feature_indicator_x; /* CPU vendor independent version of dispatcher */ void __intel_cpu_features_init_x( void ); static void x264_intel_dispatcher_override( void ) { if( __intel_cpu_feature_indicator & ~1ULL ) return; __intel_cpu_feature_indicator = 0; __intel_cpu_feature_indicator_x = 0; __intel_cpu_features_init_x(); __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; } #endif x264-master/extras/inttypes.h000066400000000000000000000166111502133446700164220ustar00rootroot00000000000000// ISO C9x compliant inttypes.h for Microsoft Visual Studio // Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 // // Copyright (c) 2006 Alexander Chemeris // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // 1. Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. The name of the author may be used to endorse or promote products // derived from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO // EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // /////////////////////////////////////////////////////////////////////////////// #ifndef _MSC_VER // [ #error "Use this header only with Microsoft Visual C++ compilers!" #endif // _MSC_VER ] #ifndef _MSC_INTTYPES_H_ // [ #define _MSC_INTTYPES_H_ #if _MSC_VER > 1000 #pragma once #endif #include "stdint.h" // 7.8 Format conversion of integer types typedef struct { intmax_t quot; intmax_t rem; } imaxdiv_t; // 7.8.1 Macros for format specifiers #if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 // The fprintf macros for signed integers are: #define PRId8 "d" #define PRIi8 "i" #define PRIdLEAST8 "d" #define PRIiLEAST8 "i" #define PRIdFAST8 "d" #define PRIiFAST8 "i" #define PRId16 "hd" #define PRIi16 "hi" #define PRIdLEAST16 "hd" #define PRIiLEAST16 "hi" #define PRIdFAST16 "hd" #define PRIiFAST16 "hi" #define PRId32 "I32d" #define PRIi32 "I32i" #define PRIdLEAST32 "I32d" #define PRIiLEAST32 "I32i" #define PRIdFAST32 "I32d" #define PRIiFAST32 "I32i" #define PRId64 "I64d" #define PRIi64 "I64i" #define PRIdLEAST64 "I64d" #define PRIiLEAST64 "I64i" #define PRIdFAST64 "I64d" #define PRIiFAST64 "I64i" #define PRIdMAX "I64d" #define PRIiMAX "I64i" #define PRIdPTR "Id" #define PRIiPTR "Ii" // The fprintf macros for unsigned integers are: #define PRIo8 "o" #define PRIu8 "u" #define PRIx8 "x" #define PRIX8 "X" #define PRIoLEAST8 "o" #define PRIuLEAST8 "u" #define PRIxLEAST8 "x" #define PRIXLEAST8 "X" #define PRIoFAST8 "o" #define PRIuFAST8 "u" #define PRIxFAST8 "x" #define PRIXFAST8 "X" #define PRIo16 "ho" #define PRIu16 "hu" #define PRIx16 "hx" #define PRIX16 "hX" #define PRIoLEAST16 "ho" #define PRIuLEAST16 "hu" #define PRIxLEAST16 "hx" #define PRIXLEAST16 "hX" #define PRIoFAST16 "ho" #define PRIuFAST16 "hu" #define PRIxFAST16 "hx" #define PRIXFAST16 "hX" #define PRIo32 "I32o" #define PRIu32 "I32u" #define PRIx32 "I32x" #define PRIX32 "I32X" #define PRIoLEAST32 "I32o" #define PRIuLEAST32 "I32u" #define PRIxLEAST32 "I32x" #define PRIXLEAST32 "I32X" #define PRIoFAST32 "I32o" #define PRIuFAST32 "I32u" #define PRIxFAST32 "I32x" #define PRIXFAST32 "I32X" #define PRIo64 "I64o" #define PRIu64 "I64u" #define PRIx64 "I64x" #define PRIX64 "I64X" #define PRIoLEAST64 "I64o" #define PRIuLEAST64 "I64u" #define PRIxLEAST64 "I64x" #define PRIXLEAST64 "I64X" #define PRIoFAST64 "I64o" #define PRIuFAST64 "I64u" #define PRIxFAST64 "I64x" #define PRIXFAST64 "I64X" #define PRIoMAX "I64o" #define PRIuMAX "I64u" #define PRIxMAX "I64x" #define PRIXMAX "I64X" #define PRIoPTR "Io" #define PRIuPTR "Iu" #define PRIxPTR "Ix" #define PRIXPTR "IX" // The fscanf macros for signed integers are: #define SCNd16 "hd" #define SCNi16 "hi" #define SCNdLEAST16 "hd" #define SCNiLEAST16 "hi" #define SCNdFAST16 "hd" #define SCNiFAST16 "hi" #define SCNd32 "ld" #define SCNi32 "li" #define SCNdLEAST32 "ld" #define SCNiLEAST32 "li" #define SCNdFAST32 "ld" #define SCNiFAST32 "li" #define SCNd64 "I64d" #define SCNi64 "I64i" #define SCNdLEAST64 "I64d" #define SCNiLEAST64 "I64i" #define SCNdFAST64 "I64d" 
#define SCNiFAST64 "I64i" #define SCNdMAX "I64d" #define SCNiMAX "I64i" #ifdef _WIN64 // [ # define SCNdPTR "I64d" # define SCNiPTR "I64i" #else // _WIN64 ][ # define SCNdPTR "ld" # define SCNiPTR "li" #endif // _WIN64 ] // The fscanf macros for unsigned integers are: #define SCNo16 "ho" #define SCNu16 "hu" #define SCNx16 "hx" #define SCNX16 "hX" #define SCNoLEAST16 "ho" #define SCNuLEAST16 "hu" #define SCNxLEAST16 "hx" #define SCNXLEAST16 "hX" #define SCNoFAST16 "ho" #define SCNuFAST16 "hu" #define SCNxFAST16 "hx" #define SCNXFAST16 "hX" #define SCNo32 "lo" #define SCNu32 "lu" #define SCNx32 "lx" #define SCNX32 "lX" #define SCNoLEAST32 "lo" #define SCNuLEAST32 "lu" #define SCNxLEAST32 "lx" #define SCNXLEAST32 "lX" #define SCNoFAST32 "lo" #define SCNuFAST32 "lu" #define SCNxFAST32 "lx" #define SCNXFAST32 "lX" #define SCNo64 "I64o" #define SCNu64 "I64u" #define SCNx64 "I64x" #define SCNX64 "I64X" #define SCNoLEAST64 "I64o" #define SCNuLEAST64 "I64u" #define SCNxLEAST64 "I64x" #define SCNXLEAST64 "I64X" #define SCNoFAST64 "I64o" #define SCNuFAST64 "I64u" #define SCNxFAST64 "I64x" #define SCNXFAST64 "I64X" #define SCNoMAX "I64o" #define SCNuMAX "I64u" #define SCNxMAX "I64x" #define SCNXMAX "I64X" #ifdef _WIN64 // [ # define SCNoPTR "I64o" # define SCNuPTR "I64u" # define SCNxPTR "I64x" # define SCNXPTR "I64X" #else // _WIN64 ][ # define SCNoPTR "lo" # define SCNuPTR "lu" # define SCNxPTR "lx" # define SCNXPTR "lX" #endif // _WIN64 ] #endif // __STDC_FORMAT_MACROS ] // 7.8.2 Functions for greatest-width integer types // 7.8.2.1 The imaxabs function #define imaxabs _abs64 // 7.8.2.2 The imaxdiv function // This is modified version of div() function from Microsoft's div.c found // in %MSVC.NET%\crt\src\div.c #ifdef STATIC_IMAXDIV // [ static #else // STATIC_IMAXDIV ][ _inline #endif // STATIC_IMAXDIV ] imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) { imaxdiv_t result; result.quot = numer / denom; result.rem = numer % denom; if (numer < 0 && result.rem > 0) { // did division wrong; must fix up ++result.quot; result.rem -= denom; } return result; } // 7.8.2.3 The strtoimax and strtoumax functions #define strtoimax _strtoi64 #define strtoumax _strtoui64 // 7.8.2.4 The wcstoimax and wcstoumax functions #define wcstoimax _wcstoi64 #define wcstoumax _wcstoui64 #endif // _MSC_INTTYPES_H_ ] x264-master/extras/stdint.h000066400000000000000000000140761502133446700160530ustar00rootroot00000000000000/* ISO C9x 7.18 Integer types * Based on ISO/IEC SC22/WG14 9899 Committee draft (SC22 N2794) * * THIS SOFTWARE IS NOT COPYRIGHTED * * Contributor: Danny Smith * * This source code is offered for use in the public domain. You may * use, modify or distribute it freely. * * This code is distributed in the hope that it will be useful but * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY * DISCLAIMED. This includes but is not limited to warranties of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
* * Date: 2000-12-02 */ #ifndef _STDINT_H #define _STDINT_H #define __need_wint_t #define __need_wchar_t #include /* 7.18.1.1 Exact-width integer types */ typedef signed char int8_t; typedef unsigned char uint8_t; typedef short int16_t; typedef unsigned short uint16_t; typedef int int32_t; typedef unsigned uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; /* 7.18.1.2 Minimum-width integer types */ typedef signed char int_least8_t; typedef unsigned char uint_least8_t; typedef short int_least16_t; typedef unsigned short uint_least16_t; typedef int int_least32_t; typedef unsigned uint_least32_t; typedef __int64 int_least64_t; typedef unsigned __int64 uint_least64_t; /* 7.18.1.3 Fastest minimum-width integer types * Not actually guaranteed to be fastest for all purposes * Here we use the exact-width types for 8 and 16-bit ints. */ typedef char int_fast8_t; typedef unsigned char uint_fast8_t; typedef short int_fast16_t; typedef unsigned short uint_fast16_t; typedef int int_fast32_t; typedef unsigned int uint_fast32_t; typedef __int64 int_fast64_t; typedef unsigned __int64 uint_fast64_t; /* 7.18.1.4 Integer types capable of holding object pointers */ /*typedef int intptr_t; typedef unsigned uintptr_t;*/ /* 7.18.1.5 Greatest-width integer types */ typedef __int64 intmax_t; typedef unsigned __int64 uintmax_t; /* 7.18.2 Limits of specified-width integer types */ #if !defined ( __cplusplus) || defined (__STDC_LIMIT_MACROS) /* 7.18.2.1 Limits of exact-width integer types */ #define INT8_MIN (-128) #define INT16_MIN (-32768) #define INT32_MIN (-2147483647 - 1) #define INT64_MIN (-9223372036854775807LL - 1) #define INT8_MAX 127 #define INT16_MAX 32767 #define INT32_MAX 2147483647 #define INT64_MAX 9223372036854775807LL #define UINT8_MAX 0xff /* 255U */ #define UINT16_MAX 0xffff /* 65535U */ #define UINT32_MAX 0xffffffff /* 4294967295U */ #define UINT64_MAX 0xffffffffffffffffULL /* 18446744073709551615ULL */ /* 7.18.2.2 Limits of minimum-width integer types */ #define INT_LEAST8_MIN INT8_MIN #define INT_LEAST16_MIN INT16_MIN #define INT_LEAST32_MIN INT32_MIN #define INT_LEAST64_MIN INT64_MIN #define INT_LEAST8_MAX INT8_MAX #define INT_LEAST16_MAX INT16_MAX #define INT_LEAST32_MAX INT32_MAX #define INT_LEAST64_MAX INT64_MAX #define UINT_LEAST8_MAX UINT8_MAX #define UINT_LEAST16_MAX UINT16_MAX #define UINT_LEAST32_MAX UINT32_MAX #define UINT_LEAST64_MAX UINT64_MAX /* 7.18.2.3 Limits of fastest minimum-width integer types */ #define INT_FAST8_MIN INT8_MIN #define INT_FAST16_MIN INT16_MIN #define INT_FAST32_MIN INT32_MIN #define INT_FAST64_MIN INT64_MIN #define INT_FAST8_MAX INT8_MAX #define INT_FAST16_MAX INT16_MAX #define INT_FAST32_MAX INT32_MAX #define INT_FAST64_MAX INT64_MAX #define UINT_FAST8_MAX UINT8_MAX #define UINT_FAST16_MAX UINT16_MAX #define UINT_FAST32_MAX UINT32_MAX #define UINT_FAST64_MAX UINT64_MAX /* 7.18.2.4 Limits of integer types capable of holding object pointers */ #if defined(_WIN64) || defined(__LP64__) #define INTPTR_MIN INT64_MIN #define INTPTR_MAX INT64_MAX #define UINTPTR_MAX UINT64_MAX #else #define INTPTR_MIN INT32_MIN #define INTPTR_MAX INT32_MAX #define UINTPTR_MAX UINT32_MAX #endif /* 7.18.2.5 Limits of greatest-width integer types */ #define INTMAX_MIN INT64_MIN #define INTMAX_MAX INT64_MAX #define UINTMAX_MAX UINT64_MAX /* 7.18.3 Limits of other integer types */ #if defined(_WIN64) || defined(__LP64__) #define PTRDIFF_MIN INT64_MIN #define PTRDIFF_MAX INT64_MAX #else #define PTRDIFF_MIN INT32_MIN #define PTRDIFF_MAX INT32_MAX #endif #define 
SIG_ATOMIC_MIN INT32_MIN #define SIG_ATOMIC_MAX INT32_MAX #ifndef SIZE_MAX #if defined(_WIN64) || defined(__LP64__) #define SIZE_MAX UINT64_MAX #else #define SIZE_MAX UINT32_MAX #endif #endif #ifndef WCHAR_MIN /* also in wchar.h */ #define WCHAR_MIN 0 #define WCHAR_MAX ((wchar_t)-1) /* UINT16_MAX */ #endif /* * wint_t is unsigned short for compatibility with MS runtime */ #define WINT_MIN 0 #define WINT_MAX ((wint_t)-1) /* UINT16_MAX */ #endif /* !defined ( __cplusplus) || defined __STDC_LIMIT_MACROS */ /* 7.18.4 Macros for integer constants */ #if !defined ( __cplusplus) || defined (__STDC_CONSTANT_MACROS) /* 7.18.4.1 Macros for minimum-width integer constants Accoding to Douglas Gwyn : "This spec was changed in ISO/IEC 9899:1999 TC1; in ISO/IEC 9899:1999 as initially published, the expansion was required to be an integer constant of precisely matching type, which is impossible to accomplish for the shorter types on most platforms, because C99 provides no standard way to designate an integer constant with width less than that of type int. TC1 changed this to require just an integer constant *expression* with *promoted* type." The trick used here is from Clive D W Feather. */ #define INT8_C(val) (INT_LEAST8_MAX-INT_LEAST8_MAX+(val)) #define INT16_C(val) (INT_LEAST16_MAX-INT_LEAST16_MAX+(val)) #define INT32_C(val) (INT_LEAST32_MAX-INT_LEAST32_MAX+(val)) #define INT64_C(val) (INT_LEAST64_MAX-INT_LEAST64_MAX+(val)) #define UINT8_C(val) (UINT_LEAST8_MAX-UINT_LEAST8_MAX+(val)) #define UINT16_C(val) (UINT_LEAST16_MAX-UINT_LEAST16_MAX+(val)) #define UINT32_C(val) (UINT_LEAST32_MAX-UINT_LEAST32_MAX+(val)) #define UINT64_C(val) (UINT_LEAST64_MAX-UINT_LEAST64_MAX+(val)) /* 7.18.4.2 Macros for greatest-width integer constants */ #define INTMAX_C(val) (INTMAX_MAX-INTMAX_MAX+(val)) #define UINTMAX_C(val) (UINTMAX_MAX-UINTMAX_MAX+(val)) #endif /* !defined ( __cplusplus) || defined __STDC_CONSTANT_MACROS */ #endif x264-master/filters/000077500000000000000000000000001502133446700145275ustar00rootroot00000000000000x264-master/filters/filters.c000066400000000000000000000105311502133446700163430ustar00rootroot00000000000000/***************************************************************************** * filters.c: common filter functions ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Diogo Franco * Steven Walters * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "filters.h" #define RETURN_IF_ERROR( cond, ... 
) RETURN_IF_ERR( cond, "options", NULL, __VA_ARGS__ ) char **x264_split_options( const char *opt_str, const char * const *options ) { int opt_count = 0, options_count = 0, found_named = 0; size_t size = 0; const char *opt = opt_str; if( !opt_str ) return NULL; while( options[options_count] ) options_count++; do { size_t length = strcspn( opt, "=," ); if( opt[length] == '=' ) { const char * const *option = options; while( *option && (strlen( *option ) != length || strncmp( opt, *option, length )) ) option++; RETURN_IF_ERROR( !*option, "Invalid option '%.*s'\n", length, opt ); found_named = 1; length += strcspn( opt + length, "," ); } else { RETURN_IF_ERROR( opt_count >= options_count, "Too many options given\n" ); RETURN_IF_ERROR( found_named, "Ordered option given after named\n" ); size += strlen( options[opt_count] ) + 1; } opt_count++; opt += length; } while( *opt++ ); size_t offset = 2 * (opt_count+1) * sizeof(char*); size += offset + (opt - opt_str); char **opts = calloc( 1, size ); RETURN_IF_ERROR( !opts, "malloc failed\n" ); #define insert_opt( src, length )\ do {\ opts[i++] = memcpy( (char*)opts + offset, src, length );\ offset += length + 1;\ src += length + 1;\ } while( 0 ) for( int i = 0; i < 2*opt_count; ) { size_t length = strcspn( opt_str, "=," ); if( opt_str[length] == '=' ) { insert_opt( opt_str, length ); length = strcspn( opt_str, "," ); } else { const char *option = options[i/2]; size_t option_length = strlen( option ); insert_opt( option, option_length ); } insert_opt( opt_str, length ); } assert( offset == size ); return opts; } char *x264_get_option( const char *name, char **split_options ) { if( split_options ) { int last_i = -1; for( int i = 0; split_options[i]; i += 2 ) if( !strcmp( split_options[i], name ) ) last_i = i; if( last_i >= 0 && split_options[last_i+1][0] ) return split_options[last_i+1]; } return NULL; } int x264_otob( const char *str, int def ) { if( str ) return !strcasecmp( str, "true" ) || !strcmp( str, "1" ) || !strcasecmp( str, "yes" ); return def; } double x264_otof( const char *str, double def ) { double ret = def; if( str ) { char *end; ret = strtod( str, &end ); if( end == str || *end != '\0' ) ret = def; } return ret; } int x264_otoi( const char *str, int def ) { int ret = def; if( str ) { char *end; ret = strtol( str, &end, 0 ); if( end == str || *end != '\0' ) ret = def; } return ret; } char *x264_otos( char *str, char *def ) { return str ? str : def; } x264-master/filters/filters.h000066400000000000000000000034121502133446700163500ustar00rootroot00000000000000/***************************************************************************** * filters.h: common filter functions ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Diogo Franco * Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
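/* Editor's note: illustrative sketch, not part of the original file.
 * Typical use of the option helpers defined above, mirroring how the CLI video
 * filters later in this archive (crop, resize, ...) consume them: split a
 * "key=value,..." string against a NULL-terminated whitelist, look up values
 * by name, convert them, then free the single returned allocation. */
static int option_helpers_example( const char *opt_string )
{
    static const char * const optlist[] = { "left", "top", "right", "bottom", NULL };
    char **opts = x264_split_options( opt_string, optlist ); /* accepts "16,8,0,0" or "left=16,top=8" */
    if( !opts )
        return -1;
    int left = x264_otoi( x264_get_option( "left", opts ), 0 ); /* falls back to 0 when absent */
    int top  = x264_otoi( x264_get_option( "top",  opts ), 0 );
    free( opts );
    return (left >= 0 && top >= 0) ? 0 : -1;
}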
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_FILTERS_H
#define X264_FILTERS_H

#include "x264cli.h"
#include "filters/video/video.h"

char **x264_split_options( const char *opt_str, const char * const *options );
char *x264_get_option( const char *name, char **split_options );

int x264_otob( const char *str, int def ); // option to bool
double x264_otof( const char *str, double def ); // option to float/double
int x264_otoi( const char *str, int def ); // option to int
char *x264_otos( char *str, char *def ); // option to string

#endif
x264-master/filters/video/000077500000000000000000000000001502133446700156355ustar00rootroot00000000000000x264-master/filters/video/cache.c000066400000000000000000000121541502133446700170470ustar00rootroot00000000000000
/*****************************************************************************
 * cache.c: cache video filter
 *****************************************************************************
 * Copyright (C) 2010-2025 x264 project
 *
 * Authors: Steven Walters
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
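/* Editor's note: illustrative sketch, not part of the original file.
 * Every video filter in this directory (cache.c below, then crop.c, depth.c,
 * fix_vfr_pts.c, ...) uses the same chaining pattern: init() captures the
 * previous filter and its handle, substitutes itself into the chain, and the
 * other callbacks delegate to the captured pair. Reduced to a do-nothing
 * pass-through filter, the pattern looks like this (release_frame and free
 * delegate the same way, and the cli_vid_filter_t table is filled in exactly
 * as at the bottom of cache.c): */
typedef struct
{
    hnd_t prev_hnd;
    cli_vid_filter_t prev_filter;
} passthrough_hnd_t;                        /* hypothetical example type */

static cli_vid_filter_t passthrough_filter; /* hypothetical; defined like cache_filter */

static int passthrough_init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info,
                             x264_param_t *param, char *opt_string )
{
    passthrough_hnd_t *h = calloc( 1, sizeof(passthrough_hnd_t) );
    if( !h )
        return -1;
    h->prev_filter = *filter;   /* remember the upstream filter...          */
    h->prev_hnd    = *handle;
    *handle = h;                /* ...and splice this filter into the chain */
    *filter = passthrough_filter;
    return 0;
}

static int passthrough_get_frame( hnd_t handle, cli_pic_t *output, int frame )
{
    passthrough_hnd_t *h = handle;
    return h->prev_filter.get_frame( h->prev_hnd, output, frame ); /* pure delegation */
}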
*****************************************************************************/ #include "video.h" #include "internal.h" #include "common/common.h" #define cache_filter x264_glue3(cache, BIT_DEPTH, filter) #if BIT_DEPTH == 8 #define NAME "cache_8" #else #define NAME "cache_10" #endif #define LAST_FRAME (h->first_frame + h->cur_size - 1) typedef struct { hnd_t prev_hnd; cli_vid_filter_t prev_filter; int max_size; int first_frame; /* first cached frame */ cli_pic_t **cache; int cur_size; int eof; /* frame beyond end of the file */ } cache_hnd_t; cli_vid_filter_t cache_filter; static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { intptr_t size = (intptr_t)opt_string; /* upon a <= 0 cache request, do nothing */ if( size <= 0 ) return 0; cache_hnd_t *h = calloc( 1, sizeof(cache_hnd_t) ); if( !h ) return -1; h->max_size = size; h->cache = malloc( (h->max_size+1) * sizeof(cli_pic_t*) ); if( !h->cache ) return -1; for( int i = 0; i < h->max_size; i++ ) { h->cache[i] = malloc( sizeof(cli_pic_t) ); if( !h->cache[i] || x264_cli_pic_alloc( h->cache[i], info->csp, info->width, info->height ) ) return -1; } h->cache[h->max_size] = NULL; /* require null terminator for list methods */ h->prev_filter = *filter; h->prev_hnd = *handle; *handle = h; *filter = cache_filter; return 0; } static void fill_cache( cache_hnd_t *h, int frame ) { /* shift frames out of the cache as the frame request is beyond the filled cache */ int shift = frame - LAST_FRAME; /* no frames to shift or no frames left to read */ if( shift <= 0 || h->eof ) return; /* the next frames to read are either * A) starting at the end of the current cache, or * B) starting at a new frame that has the end of the cache at the desired frame * and proceeding to fill the entire cache */ int cur_frame = X264_MAX( h->first_frame + h->cur_size, frame - h->max_size + 1 ); /* the new starting point is either * A) the current one shifted the number of frames entering/leaving the cache, or * B) at a new frame that has the end of the cache at the desired frame. */ h->first_frame = X264_MIN( h->first_frame + shift, cur_frame ); h->cur_size = X264_MAX( h->cur_size - shift, 0 ); while( h->cur_size < h->max_size ) { cli_pic_t temp; /* the old front frame is going to shift off, overwrite it with the new frame */ cli_pic_t *cache = h->cache[0]; if( h->prev_filter.get_frame( h->prev_hnd, &temp, cur_frame ) || x264_cli_pic_copy( cache, &temp ) || h->prev_filter.release_frame( h->prev_hnd, &temp, cur_frame ) ) { h->eof = cur_frame; return; } /* the read was successful, shift the frame off the front to the end */ x264_frame_push( (void*)h->cache, x264_frame_shift( (void*)h->cache ) ); cur_frame++; h->cur_size++; } } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { cache_hnd_t *h = handle; FAIL_IF_ERR( frame < h->first_frame, NAME, "frame %d is before first cached frame %d \n", frame, h->first_frame ); fill_cache( h, frame ); if( frame > LAST_FRAME ) /* eof */ return -1; int idx = frame - (h->eof ? 
h->eof - h->max_size : h->first_frame); *output = *h->cache[idx]; return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { /* the parent filter's frame has already been released so do nothing here */ return 0; } static void free_filter( hnd_t handle ) { cache_hnd_t *h = handle; h->prev_filter.free( h->prev_hnd ); for( int i = 0; i < h->max_size; i++ ) { x264_cli_pic_clean( h->cache[i] ); free( h->cache[i] ); } free( h->cache ); free( h ); } cli_vid_filter_t cache_filter = { NAME, NULL, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/crop.c000066400000000000000000000115251502133446700167500ustar00rootroot00000000000000/***************************************************************************** * crop.c: crop video filter ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * James Darnley * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "video.h" #define NAME "crop" #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ ) cli_vid_filter_t crop_filter; typedef struct { hnd_t prev_hnd; cli_vid_filter_t prev_filter; int dims[4]; /* left, top, width, height */ const x264_cli_csp_t *csp; } crop_hnd_t; static void help( int longhelp ) { printf( " "NAME":left,top,right,bottom\n" ); if( !longhelp ) return; printf( " removes pixels from the edges of the frame\n" ); } static int handle_opts( crop_hnd_t *h, video_info_t *info, char **opts, const char * const *optlist ) { for( int i = 0; i < 4; i++ ) { char *opt = x264_get_option( optlist[i], opts ); FAIL_IF_ERROR( !opt, "%s crop value not specified\n", optlist[i] ); h->dims[i] = x264_otoi( opt, -1 ); FAIL_IF_ERROR( h->dims[i] < 0, "%s crop value `%s' is less than 0\n", optlist[i], opt ); int dim_mod = i&1 ? 
(h->csp->mod_height << info->interlaced) : h->csp->mod_width; FAIL_IF_ERROR( h->dims[i] % dim_mod, "%s crop value `%s' is not a multiple of %d\n", optlist[i], opt, dim_mod ); } return 0; } static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { FAIL_IF_ERROR( x264_cli_csp_is_invalid( info->csp ), "invalid csp %d\n", info->csp ); crop_hnd_t *h = calloc( 1, sizeof(crop_hnd_t) ); if( !h ) return -1; h->csp = x264_cli_get_csp( info->csp ); static const char * const optlist[] = { "left", "top", "right", "bottom", NULL }; char **opts = x264_split_options( opt_string, optlist ); if( !opts ) return -1; int err = handle_opts( h, info, opts, optlist ); free( opts ); if( err ) return -1; h->dims[2] = info->width - h->dims[0] - h->dims[2]; h->dims[3] = info->height - h->dims[1] - h->dims[3]; FAIL_IF_ERROR( h->dims[2] <= 0 || h->dims[3] <= 0, "invalid output resolution %dx%d\n", h->dims[2], h->dims[3] ); if( info->width != h->dims[2] || info->height != h->dims[3] ) x264_cli_log( NAME, X264_LOG_INFO, "cropping to %dx%d\n", h->dims[2], h->dims[3] ); else { /* do nothing as the user supplied 0s for all the values */ free( h ); return 0; } /* done initializing, overwrite values */ info->width = h->dims[2]; info->height = h->dims[3]; h->prev_filter = *filter; h->prev_hnd = *handle; *handle = h; *filter = crop_filter; return 0; } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { crop_hnd_t *h = handle; if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) ) return -1; output->img.width = h->dims[2]; output->img.height = h->dims[3]; /* shift the plane pointers down 'top' rows and right 'left' columns. */ for( int i = 0; i < output->img.planes; i++ ) { intptr_t offset = output->img.stride[i] * h->dims[1] * h->csp->height[i]; offset += h->dims[0] * h->csp->width[i] * x264_cli_csp_depth_factor( output->img.csp ); output->img.plane[i] += offset; } return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { crop_hnd_t *h = handle; /* NO filter should ever have a dependent release based on the plane pointers, * so avoid unnecessary unshifting */ return h->prev_filter.release_frame( h->prev_hnd, pic, frame ); } static void free_filter( hnd_t handle ) { crop_hnd_t *h = handle; h->prev_filter.free( h->prev_hnd ); free( h ); } cli_vid_filter_t crop_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/depth.c000066400000000000000000000207551502133446700171160ustar00rootroot00000000000000/***************************************************************************** * depth.c: bit-depth conversion video filter ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Oskar Arvidsson * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
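/* Editor's note: illustrative sketch, not part of the original files.
 * The crop filter above never copies pixel data: its get_frame() only advances
 * each plane pointer by `top` rows and `left` columns, scaled by the plane's
 * subsampling factors and bytes per sample. For 8-bit I420 with left=16, top=8,
 * a luma stride of 1920 and a chroma stride of 960 that works out to
 *     luma:   1920*8*1.0 + 16*1.0*1 = 15376 bytes
 *     chroma:  960*8*0.5 + 16*0.5*1 =  3848 bytes per chroma plane.
 * The same computation in generic form: */
static intptr_t crop_plane_offset_example( intptr_t stride, int left, int top,
                                           double width_factor, double height_factor,
                                           int bytes_per_sample )
{
    /* rows skipped times stride, plus columns skipped converted to bytes */
    return (intptr_t)(stride * top * height_factor)
         + (intptr_t)(left * width_factor * bytes_per_sample);
}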
* * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "video.h" #include "common/common.h" #define depth_filter x264_glue3(depth, BIT_DEPTH, filter) #if BIT_DEPTH == 8 #define NAME "depth_8" #else #define NAME "depth_10" #endif #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ ) cli_vid_filter_t depth_filter; typedef struct { hnd_t prev_hnd; cli_vid_filter_t prev_filter; int bit_depth; int dst_csp; cli_pic_t buffer; int16_t *error_buf; } depth_hnd_t; static int depth_filter_csp_is_supported( int csp ) { int csp_mask = csp & X264_CSP_MASK; return csp_mask == X264_CSP_I400 || csp_mask == X264_CSP_I420 || csp_mask == X264_CSP_I422 || csp_mask == X264_CSP_I444 || csp_mask == X264_CSP_YV12 || csp_mask == X264_CSP_YV16 || csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16 || csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB || csp_mask == X264_CSP_BGRA; } static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 : csp_mask == X264_CSP_BGRA ? 4 : 1; } /* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been * written in such a way so that if the source has been upconverted using the * same algorithm as used in scale_image, dithering down to the source bit * depth again is lossless. */ #define DITHER_PLANE( pitch ) \ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int src_stride, \ int width, int height, int16_t *errors ) \ { \ const int lshift = 16-BIT_DEPTH; \ const int rshift = 16-BIT_DEPTH+2; \ const int half = 1 << (16-BIT_DEPTH+1); \ const int pixel_max = (1 << BIT_DEPTH)-1; \ memset( errors, 0, (width+1) * sizeof(int16_t) ); \ for( int y = 0; y < height; y++, src += src_stride, dst += dst_stride ) \ { \ int err = 0; \ for( int x = 0; x < width; x++ ) \ { \ err = err*2 + errors[x] + errors[x+1]; \ dst[x*pitch] = x264_clip3( ((src[x*pitch]<<2)+err+half) >> rshift, 0, pixel_max ); \ errors[x] = err = src[x*pitch] - (dst[x*pitch] << lshift); \ } \ } \ } DITHER_PLANE( 1 ) DITHER_PLANE( 2 ) DITHER_PLANE( 3 ) DITHER_PLANE( 4 ) static void dither_image( cli_image_t *out, cli_image_t *img, int16_t *error_buf ) { int csp_mask = img->csp & X264_CSP_MASK; for( int i = 0; i < img->planes; i++ ) { int num_interleaved = csp_num_interleaved( img->csp, i ); int height = x264_cli_csps[csp_mask].height[i] * img->height; int width = x264_cli_csps[csp_mask].width[i] * img->width / num_interleaved; #define CALL_DITHER_PLANE( pitch, off ) \ dither_plane_##pitch( ((pixel*)out->plane[i])+off, out->stride[i]/SIZEOF_PIXEL, \ ((uint16_t*)img->plane[i])+off, img->stride[i]/2, width, height, error_buf ) if( num_interleaved == 4 ) { CALL_DITHER_PLANE( 4, 0 ); CALL_DITHER_PLANE( 4, 1 ); CALL_DITHER_PLANE( 4, 2 ); CALL_DITHER_PLANE( 4, 3 ); //we probably can skip this one } else if( num_interleaved == 3 ) { CALL_DITHER_PLANE( 3, 0 ); CALL_DITHER_PLANE( 3, 1 ); CALL_DITHER_PLANE( 3, 2 ); } else if( num_interleaved == 2 ) { CALL_DITHER_PLANE( 2, 0 ); CALL_DITHER_PLANE( 2, 1 ); } else //if( num_interleaved == 1 ) { CALL_DITHER_PLANE( 1, 0 ); } } } static void scale_image( cli_image_t *output, cli_image_t *img ) { int csp_mask 
= img->csp & X264_CSP_MASK; const int shift = BIT_DEPTH - 8; for( int i = 0; i < img->planes; i++ ) { uint8_t *src = img->plane[i]; uint16_t *dst = (uint16_t*)output->plane[i]; int height = x264_cli_csps[csp_mask].height[i] * img->height; int width = x264_cli_csps[csp_mask].width[i] * img->width; for( int j = 0; j < height; j++ ) { for( int k = 0; k < width; k++ ) dst[k] = src[k] << shift; src += img->stride[i]; dst += output->stride[i]/2; } } } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { depth_hnd_t *h = handle; if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) ) return -1; if( h->bit_depth < 16 && output->img.csp & X264_CSP_HIGH_DEPTH ) { dither_image( &h->buffer.img, &output->img, h->error_buf ); output->img = h->buffer.img; } else if( h->bit_depth > 8 && !(output->img.csp & X264_CSP_HIGH_DEPTH) ) { scale_image( &h->buffer.img, &output->img ); output->img = h->buffer.img; } return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { depth_hnd_t *h = handle; return h->prev_filter.release_frame( h->prev_hnd, pic, frame ); } static void free_filter( hnd_t handle ) { depth_hnd_t *h = handle; h->prev_filter.free( h->prev_hnd ); x264_cli_pic_clean( &h->buffer ); x264_free( h ); } static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { int ret = 0; int change_fmt = (info->csp ^ param->i_csp) & X264_CSP_HIGH_DEPTH; int csp = ~(~info->csp ^ change_fmt); int bit_depth = 8*x264_cli_csp_depth_factor( csp ); if( opt_string ) { static const char * const optlist[] = { "bit_depth", NULL }; char **opts = x264_split_options( opt_string, optlist ); if( opts ) { char *str_bit_depth = x264_get_option( "bit_depth", opts ); bit_depth = x264_otoi( str_bit_depth, -1 ); ret = bit_depth < 8 || bit_depth > 16; csp = bit_depth > 8 ? csp | X264_CSP_HIGH_DEPTH : csp & ~X264_CSP_HIGH_DEPTH; change_fmt = (info->csp ^ csp) & X264_CSP_HIGH_DEPTH; free( opts ); } else ret = 1; } FAIL_IF_ERROR( bit_depth != BIT_DEPTH, "this filter supports only bit depth %d\n", BIT_DEPTH ); FAIL_IF_ERROR( ret, "unsupported bit depth conversion.\n" ); /* only add the filter to the chain if it's needed */ if( change_fmt || bit_depth != 8 * x264_cli_csp_depth_factor( csp ) ) { FAIL_IF_ERROR( !depth_filter_csp_is_supported(csp), "unsupported colorspace.\n" ); depth_hnd_t *h = x264_malloc( sizeof(depth_hnd_t) + (info->width+1)*sizeof(int16_t) ); if( !h ) return -1; h->error_buf = (int16_t*)(h + 1); h->dst_csp = csp; h->bit_depth = bit_depth; h->prev_hnd = *handle; h->prev_filter = *filter; if( x264_cli_pic_alloc( &h->buffer, h->dst_csp, info->width, info->height ) ) { x264_free( h ); return -1; } *handle = h; *filter = depth_filter; info->csp = h->dst_csp; } return 0; } cli_vid_filter_t depth_filter = { NAME, NULL, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/fix_vfr_pts.c000066400000000000000000000113451502133446700203360ustar00rootroot00000000000000/***************************************************************************** * fix_vfr_pts.c: vfr pts fixing video filter ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
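/* Editor's note: illustrative sketch, not part of the original file.
 * DITHER_PLANE above implements Sierra-2-4A-style error diffusion; per the
 * comment in this file, input that was upconverted with the same shift used by
 * scale_image() dithers back to its source bit depth losslessly. Expanded by
 * hand for 8-bit output (BIT_DEPTH == 8) with pitch 1, the inner loop reads: */
static void dither_row_to_8bit_example( uint8_t *dst, const uint16_t *src, int width,
                                        int16_t *errors /* width+1 entries, zeroed once per plane */ )
{
    int err = 0;
    for( int x = 0; x < width; x++ )
    {
        err = err*2 + errors[x] + errors[x+1];      /* gather the diffused error */
        int v = ((src[x] << 2) + err + 512) >> 10;  /* half = 512, rshift = 10 when BIT_DEPTH == 8 */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        errors[x] = err = src[x] - (dst[x] << 8);   /* residual diffused into the next row */
    }
}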
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "video.h" #include "internal.h" /* This filter calculates and store the frame's duration to the frame data * (if it is not already calculated when the frame arrives to this point) * so it can be used by filters that will need to reconstruct pts due to * out-of-order frame requests */ typedef struct { hnd_t prev_hnd; cli_vid_filter_t prev_filter; /* we need 1 buffer picture and 1 place holder */ cli_pic_t buffer; cli_pic_t holder; int buffer_allocated; int holder_frame; int holder_ret; int64_t pts; int64_t last_duration; } fix_vfr_pts_hnd_t; cli_vid_filter_t fix_vfr_pts_filter; static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { /* if the input is not vfr, we don't do anything */ if( !info->vfr ) return 0; fix_vfr_pts_hnd_t *h = calloc( 1, sizeof(fix_vfr_pts_hnd_t) ); if( !h ) return -1; h->holder_frame = -1; h->prev_hnd = *handle; h->prev_filter = *filter; *handle = h; *filter = fix_vfr_pts_filter; return 0; } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { fix_vfr_pts_hnd_t *h = handle; /* if we want the holder picture and it errored, return the error. */ if( frame == h->holder_frame ) { if( h->holder_ret ) return h->holder_ret; } else { /* if we have a holder frame and we don't want it, release the frame */ if( h->holder_frame > 0 && h->holder_frame < frame && h->prev_filter.release_frame( h->prev_hnd, &h->holder, h->holder_frame ) ) return -1; h->holder_frame = -1; if( h->prev_filter.get_frame( h->prev_hnd, &h->holder, frame ) ) return -1; } /* if the frame's duration is not set already, read the next frame to set it. 
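     * (Editor's note, worked example only, not part of the original file: with a
     * 1/1000 timebase and input pts 0, 33, 100, ... the peek at frame n+1 below
     * yields durations 33, 67, ... (clamped to at least 1), and output pts are
     * re-accumulated from those durations via h->pts, so each frame leaves this
     * filter with pts equal to the sum of all previous durations even when later
     * filters request frames out of order.)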
*/ if( !h->holder.duration ) { /* allocate a buffer picture if we didn't already */ if( !h->buffer_allocated ) { if( x264_cli_pic_alloc( &h->buffer, h->holder.img.csp, h->holder.img.width, h->holder.img.height ) ) return -1; h->buffer_allocated = 1; } h->holder_frame = frame+1; /* copy the current frame to the buffer, release it, and then read in the next frame to the placeholder */ if( x264_cli_pic_copy( &h->buffer, &h->holder ) || h->prev_filter.release_frame( h->prev_hnd, &h->holder, frame ) ) return -1; h->holder_ret = h->prev_filter.get_frame( h->prev_hnd, &h->holder, h->holder_frame ); /* suppress non-monotonic pts warnings by setting the duration to be at least 1 */ if( !h->holder_ret ) h->last_duration = X264_MAX( h->holder.pts - h->buffer.pts, 1 ); h->buffer.duration = h->last_duration; *output = h->buffer; } else *output = h->holder; output->pts = h->pts; h->pts += output->duration; return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { fix_vfr_pts_hnd_t *h = handle; /* if the frame is the buffered one, it's already been released */ if( frame == (h->holder_frame - 1) ) return 0; return h->prev_filter.release_frame( h->prev_hnd, pic, frame ); } static void free_filter( hnd_t handle ) { fix_vfr_pts_hnd_t *h = handle; h->prev_filter.free( h->prev_hnd ); if( h->buffer_allocated ) x264_cli_pic_clean( &h->buffer ); free( h ); } cli_vid_filter_t fix_vfr_pts_filter = { "fix_vfr_pts", NULL, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/internal.c000066400000000000000000000045731502133446700176260ustar00rootroot00000000000000/***************************************************************************** * internal.c: video filter utilities ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "internal.h" #define FAIL_IF_ERROR( cond, ... 
) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ ) void x264_cli_plane_copy( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h ) { while( h-- ) { memcpy( dst, src, w ); dst += i_dst; src += i_src; } } int x264_cli_pic_copy( cli_pic_t *out, cli_pic_t *in ) { int csp = in->img.csp & X264_CSP_MASK; FAIL_IF_ERROR( x264_cli_csp_is_invalid( in->img.csp ), "invalid colorspace arg %d\n", in->img.csp ); FAIL_IF_ERROR( in->img.csp != out->img.csp || in->img.height != out->img.height || in->img.width != out->img.width, "incompatible frame properties\n" ); /* copy data */ out->duration = in->duration; out->pts = in->pts; out->opaque = in->opaque; for( int i = 0; i < out->img.planes; i++ ) { int height = in->img.height * x264_cli_csps[csp].height[i]; int width = in->img.width * x264_cli_csps[csp].width[i]; width *= x264_cli_csp_depth_factor( in->img.csp ); x264_cli_plane_copy( out->img.plane[i], out->img.stride[i], in->img.plane[i], in->img.stride[i], width, height ); } return 0; } x264-master/filters/video/internal.h000066400000000000000000000027001502133446700176210ustar00rootroot00000000000000/***************************************************************************** * internal.h: video filter utilities ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_FILTER_VIDEO_INTERNAL_H #define X264_FILTER_VIDEO_INTERNAL_H #include "video.h" void x264_cli_plane_copy( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h ); int x264_cli_pic_copy( cli_pic_t *out, cli_pic_t *in ); #endif x264-master/filters/video/resize.c000066400000000000000000000600011502133446700172770ustar00rootroot00000000000000/***************************************************************************** * resize.c: resize video filter ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
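/* Editor's note: illustrative sketch, not part of the original file. When the
 * resize filter below is given a new resolution but no explicit SAR, it adapts
 * the sample aspect ratio so the picture is not stretched, using
 *     new_sar = (new_h * old_w * old_sar_w) : (old_h * new_w * old_sar_h);
 * e.g. 1920x1080 at SAR 1:1 resized to 1440x1080 becomes SAR 4:3. A
 * self-contained version of that computation (plain gcd instead of the
 * x264_reduce_fraction helpers used by the real code): */
static uint64_t sar_gcd_example( uint64_t a, uint64_t b )
{
    while( b )
    {
        uint64_t t = a % b;
        a = b;
        b = t;
    }
    return a;
}

static void adjusted_sar_example( uint32_t *sar_w, uint32_t *sar_h,
                                  int old_w, int old_h, int new_w, int new_h )
{
    uint64_t num = (uint64_t)new_h * old_w * *sar_w;
    uint64_t den = (uint64_t)old_h * new_w * *sar_h;
    uint64_t g = sar_gcd_example( num, den );
    if( !g )
        return;
    *sar_w = (uint32_t)(num / g);   /* 1920x1080 @ 1:1 -> 1440x1080 gives 4:3 */
    *sar_h = (uint32_t)(den / g);
}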
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "video.h" #define NAME "resize" #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ ) cli_vid_filter_t resize_filter; static int full_check( video_info_t *info, x264_param_t *param ) { int required = 0; required |= info->csp != param->i_csp; required |= info->width != param->i_width; required |= info->height != param->i_height; required |= info->fullrange != param->vui.b_fullrange; return required; } #if HAVE_SWSCALE #undef DECLARE_ALIGNED #include #include #include #ifndef AV_PIX_FMT_BGRA64 #define AV_PIX_FMT_BGRA64 AV_PIX_FMT_NONE #endif typedef struct { int width; int height; int pix_fmt; int range; } frame_prop_t; typedef struct { hnd_t prev_hnd; cli_vid_filter_t prev_filter; cli_pic_t buffer; int buffer_allocated; int dst_csp; int input_range; struct SwsContext *ctx; uint32_t ctx_flags; /* state of swapping chroma planes pre and post resize */ int pre_swap_chroma; int post_swap_chroma; int fast_mono; /* yuv with planar luma can be "converted" to monochrome by simply ignoring chroma */ int variable_input; /* input is capable of changing properties */ int working; /* we have already started working with frames */ frame_prop_t dst; /* desired output properties */ frame_prop_t scale; /* properties of the SwsContext input */ } resizer_hnd_t; static void help( int longhelp ) { printf( " "NAME":[width,height][,sar][,fittobox][,csp][,method]\n" ); if( !longhelp ) return; printf( " resizes frames based on the given criteria:\n" " - resolution only: resizes and adapts sar to avoid stretching\n" " - sar only: sets the sar and resizes to avoid stretching\n" " - resolution and sar: resizes to given resolution and sets the sar\n" " - fittobox: resizes the video based on the desired constraints\n" " - width, height, both\n" " - fittobox and sar: same as above except with specified sar\n" " - csp: convert to the given csp. 
syntax: [name][:depth]\n" " - valid csp names [keep current]: " ); for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ ) { if( x264_cli_csps[i].name ) { printf( "%s", x264_cli_csps[i].name ); if( i+1 < X264_CSP_CLI_MAX ) printf( ", " ); } } printf( "\n" " - depth: 8 or 16 bits per pixel [keep current]\n" " note: not all depths are supported by all csps.\n" " - method: use resizer method [\"bicubic\"]\n" " - fastbilinear, bilinear, bicubic, experimental, point,\n" " - area, bicublin, gauss, sinc, lanczos, spline\n" ); } static uint32_t convert_method_to_flag( const char *name ) { uint32_t flag = 0; if( !strcasecmp( name, "fastbilinear" ) ) flag = SWS_FAST_BILINEAR; else if( !strcasecmp( name, "bilinear" ) ) flag = SWS_BILINEAR; else if( !strcasecmp( name, "bicubic" ) ) flag = SWS_BICUBIC; else if( !strcasecmp( name, "experimental" ) ) flag = SWS_X; else if( !strcasecmp( name, "point" ) ) flag = SWS_POINT; else if( !strcasecmp( name, "area" ) ) flag = SWS_AREA; else if( !strcasecmp( name, "bicublin" ) ) flag = SWS_BICUBLIN; else if( !strcasecmp( name, "gauss" ) ) flag = SWS_GAUSS; else if( !strcasecmp( name, "sinc" ) ) flag = SWS_SINC; else if( !strcasecmp( name, "lanczos" ) ) flag = SWS_LANCZOS; else if( !strcasecmp( name, "spline" ) ) flag = SWS_SPLINE; else // default flag = SWS_BICUBIC; return flag; } static int convert_csp_to_pix_fmt( int csp ) { if( csp&X264_CSP_OTHER ) return csp&X264_CSP_MASK; switch( csp&X264_CSP_MASK ) { case X264_CSP_I400: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_GRAY16 : AV_PIX_FMT_GRAY8; case X264_CSP_YV12: /* specially handled via swapping chroma */ case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_YUV420P16 : AV_PIX_FMT_YUV420P; case X264_CSP_YV16: /* specially handled via swapping chroma */ case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_YUV422P16 : AV_PIX_FMT_YUV422P; case X264_CSP_YV24: /* specially handled via swapping chroma */ case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_YUV444P16 : AV_PIX_FMT_YUV444P; case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48 : AV_PIX_FMT_RGB24; case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48 : AV_PIX_FMT_BGR24; case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA; /* the following has no equivalent 16-bit depth in swscale */ case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12; case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21; case X264_CSP_YUYV: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_YUYV422; case X264_CSP_UYVY: return csp&X264_CSP_HIGH_DEPTH ? 
AV_PIX_FMT_NONE : AV_PIX_FMT_UYVY422; /* the following is not supported by swscale at all */ case X264_CSP_NV16: default: return AV_PIX_FMT_NONE; } } static int pix_number_of_planes( const AVPixFmtDescriptor *pix_desc ) { int num_planes = 0; for( int i = 0; i < pix_desc->nb_components; i++ ) { int plane_plus1 = pix_desc->comp[i].plane + 1; num_planes = X264_MAX( plane_plus1, num_planes ); } return num_planes; } static int pick_closest_supported_csp( int csp ) { int pix_fmt = convert_csp_to_pix_fmt( csp ); // first determine the base csp int ret = X264_CSP_NONE; const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get( pix_fmt ); if( !pix_desc || !pix_desc->name ) return ret; const char *pix_fmt_name = pix_desc->name; int is_rgb = pix_desc->flags & (AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_PAL); int is_bgr = !!strstr( pix_fmt_name, "bgr" ); if( is_bgr || is_rgb ) { if( pix_desc->nb_components == 4 ) // has alpha ret = X264_CSP_BGRA; else if( is_bgr ) ret = X264_CSP_BGR; else ret = X264_CSP_RGB; } else { // yuv-based if( pix_desc->nb_components == 1 || pix_desc->nb_components == 2 ) // no chroma ret = X264_CSP_I400; else if( pix_desc->log2_chroma_w && pix_desc->log2_chroma_h ) // reduced chroma width & height ret = (pix_number_of_planes( pix_desc ) == 2) ? X264_CSP_NV12 : X264_CSP_I420; else if( pix_desc->log2_chroma_w ) // reduced chroma width only ret = X264_CSP_I422; // X264_CSP_NV16 is not supported by swscale so don't use it else ret = X264_CSP_I444; } // now determine high depth for( int i = 0; i < pix_desc->nb_components; i++ ) if( pix_desc->comp[i].depth > 8 ) ret |= X264_CSP_HIGH_DEPTH; return ret; } static int handle_opts( const char * const *optlist, char **opts, video_info_t *info, resizer_hnd_t *h ) { uint32_t out_sar_w, out_sar_h; char *str_width = x264_get_option( optlist[0], opts ); char *str_height = x264_get_option( optlist[1], opts ); char *str_sar = x264_get_option( optlist[2], opts ); char *fittobox = x264_get_option( optlist[3], opts ); char *str_csp = x264_get_option( optlist[4], opts ); int width = x264_otoi( str_width, -1 ); int height = x264_otoi( str_height, -1 ); int csp_only = 0; uint32_t in_sar_w = info->sar_width; uint32_t in_sar_h = info->sar_height; if( str_csp ) { /* output csp was specified, first check if optional depth was provided */ char *str_depth = strchr( str_csp, ':' ); int depth = x264_cli_csp_depth_factor( info->csp ) * 8; if( str_depth ) { /* csp bit depth was specified */ *str_depth++ = '\0'; depth = x264_otoi( str_depth, -1 ); FAIL_IF_ERROR( depth != 8 && depth != 16, "unsupported bit depth %d\n", depth ); } /* now lookup against the list of valid csps */ int csp; if( strlen( str_csp ) == 0 ) csp = info->csp & X264_CSP_MASK; else for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- ) { if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) ) break; } FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp ); h->dst_csp = csp; if( depth == 16 ) h->dst_csp |= X264_CSP_HIGH_DEPTH; } /* if the input sar is currently invalid, set it to 1:1 so it can be used in math */ if( !in_sar_w || !in_sar_h ) in_sar_w = in_sar_h = 1; if( str_sar ) { FAIL_IF_ERROR( 2 != sscanf( str_sar, "%u:%u", &out_sar_w, &out_sar_h ) && 2 != sscanf( str_sar, "%u/%u", &out_sar_w, &out_sar_h ), "invalid sar `%s'\n", str_sar ); } else out_sar_w = out_sar_h = 1; if( fittobox ) { /* resize the video to fit the box as much as possible */ if( !strcasecmp( fittobox, "both" ) ) { FAIL_IF_ERROR( width <= 0 || height <= 0, "invalid box 
resolution %sx%s\n", x264_otos( str_width, "" ), x264_otos( str_height, "" ) ); } else if( !strcasecmp( fittobox, "width" ) ) { FAIL_IF_ERROR( width <= 0, "invalid box width `%s'\n", x264_otos( str_width, "" ) ); height = INT_MAX; } else if( !strcasecmp( fittobox, "height" ) ) { FAIL_IF_ERROR( height <= 0, "invalid box height `%s'\n", x264_otos( str_height, "" ) ); width = INT_MAX; } else FAIL_IF_ERROR( 1, "invalid fittobox mode `%s'\n", fittobox ); /* maximally fit the new coded resolution to the box */ const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp ); double width_units = (double)info->height * in_sar_h * out_sar_w; double height_units = (double)info->width * in_sar_w * out_sar_h; width = width / csp->mod_width * csp->mod_width; height = height / csp->mod_height * csp->mod_height; if( width * width_units > height * height_units ) { int new_width = round( height * height_units / (width_units * csp->mod_width) ); new_width *= csp->mod_width; width = X264_MIN( new_width, width ); } else { int new_height = round( width * width_units / (height_units * csp->mod_height) ); new_height *= csp->mod_height; height = X264_MIN( new_height, height ); } } else { if( str_width || str_height ) { FAIL_IF_ERROR( width <= 0 || height <= 0, "invalid resolution %sx%s\n", x264_otos( str_width, "" ), x264_otos( str_height, "" ) ); if( !str_sar ) /* res only -> adjust sar */ { /* new_sar = (new_h * old_w * old_sar_w) / (old_h * new_w * old_sar_h) */ uint64_t num = (uint64_t)info->width * height; uint64_t den = (uint64_t)info->height * width; x264_reduce_fraction64( &num, &den ); out_sar_w = num * in_sar_w; out_sar_h = den * in_sar_h; x264_reduce_fraction( &out_sar_w, &out_sar_h ); } } else if( str_sar ) /* sar only -> adjust res */ { const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp ); double width_units = (double)in_sar_h * out_sar_w; double height_units = (double)in_sar_w * out_sar_h; width = info->width; height = info->height; if( width_units > height_units ) // SAR got wider, decrease width { width = round( info->width * height_units / (width_units * csp->mod_width) ); width *= csp->mod_width; } else // SAR got thinner, decrease height { height = round( info->height * width_units / (height_units * csp->mod_height) ); height *= csp->mod_height; } } else /* csp only */ { h->dst.width = info->width; h->dst.height = info->height; csp_only = 1; } } if( !csp_only ) { info->sar_width = out_sar_w; info->sar_height = out_sar_h; h->dst.width = width; h->dst.height = height; } return 0; } static int init_sws_context( resizer_hnd_t *h ) { if( h->ctx ) sws_freeContext( h->ctx ); h->ctx = sws_alloc_context(); if( !h->ctx ) return -1; av_opt_set_int( h->ctx, "sws_flags", h->ctx_flags, 0 ); av_opt_set_int( h->ctx, "dstw", h->dst.width, 0 ); av_opt_set_int( h->ctx, "dsth", h->dst.height, 0 ); av_opt_set_int( h->ctx, "dst_format", h->dst.pix_fmt, 0 ); av_opt_set_int( h->ctx, "dst_range", h->dst.range, 0 ); av_opt_set_int( h->ctx, "srcw", h->scale.width, 0 ); av_opt_set_int( h->ctx, "srch", h->scale.height, 0 ); av_opt_set_int( h->ctx, "src_format", h->scale.pix_fmt, 0 ); av_opt_set_int( h->ctx, "src_range", h->scale.range, 0 ); /* FIXME: use the correct matrix coefficients (only YUV -> RGB conversions are supported) */ sws_setColorspaceDetails( h->ctx, sws_getCoefficients( SWS_CS_DEFAULT ), h->scale.range, sws_getCoefficients( SWS_CS_DEFAULT ), h->dst.range, 0, 1<<16, 1<<16 ); return sws_init_context( h->ctx, NULL, NULL ) < 0; } static int check_resizer( resizer_hnd_t *h, cli_pic_t *in ) { frame_prop_t 
input_prop = { in->img.width, in->img.height, convert_csp_to_pix_fmt( in->img.csp ), h->input_range }; if( !memcmp( &input_prop, &h->scale, sizeof(frame_prop_t) ) ) return 0; /* also warn if the resizer was initialized after the first frame */ if( h->ctx || h->working ) { x264_cli_log( NAME, X264_LOG_WARNING, "stream properties changed at pts %"PRId64"\n", in->pts ); h->fast_mono = 0; } h->scale = input_prop; if( !h->buffer_allocated && !h->fast_mono ) { if( x264_cli_pic_alloc_aligned( &h->buffer, h->dst_csp, h->dst.width, h->dst.height ) ) return -1; h->buffer_allocated = 1; } FAIL_IF_ERROR( init_sws_context( h ), "swscale init failed\n" ); return 0; } static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { /* if called for normalizing the csp to known formats and the format is not unknown, exit */ if( opt_string && !strcmp( opt_string, "normcsp" ) && !(info->csp&X264_CSP_OTHER) ) return 0; /* if called by x264cli and nothing needs to be done, exit */ if( !opt_string && !full_check( info, param ) ) return 0; static const char * const optlist[] = { "width", "height", "sar", "fittobox", "csp", "method", NULL }; char **opts = x264_split_options( opt_string, optlist ); if( !opts && opt_string ) return -1; resizer_hnd_t *h = calloc( 1, sizeof(resizer_hnd_t) ); if( !h ) return -1; h->ctx_flags = convert_method_to_flag( x264_otos( x264_get_option( optlist[5], opts ), "" ) ); if( opts ) { h->dst_csp = info->csp; h->dst.width = info->width; h->dst.height = info->height; h->dst.range = info->fullrange; // maintain input range if( !strcmp( opt_string, "normcsp" ) ) { free( opts ); /* only in normalization scenarios is the input capable of changing properties */ h->variable_input = 1; h->dst_csp = pick_closest_supported_csp( info->csp ); FAIL_IF_ERROR( h->dst_csp == X264_CSP_NONE, "filter get invalid input pixel format %d (colorspace %d)\n", convert_csp_to_pix_fmt( info->csp ), info->csp ); } else { int err = handle_opts( optlist, opts, info, h ); free( opts ); if( err ) return -1; } } else { h->dst_csp = param->i_csp; h->dst.width = param->i_width; h->dst.height = param->i_height; h->dst.range = param->vui.b_fullrange; // change to libx264's range } if( h->ctx_flags != SWS_FAST_BILINEAR ) h->ctx_flags |= SWS_FULL_CHR_H_INT | SWS_FULL_CHR_H_INP | SWS_ACCURATE_RND; h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp ); h->scale = h->dst; h->input_range = info->fullrange; /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */ int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER); int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER); h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24; h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24; int src_pix_fmt = convert_csp_to_pix_fmt( info->csp ); int src_pix_fmt_inv = convert_csp_to_pix_fmt( info->csp ^ X264_CSP_HIGH_DEPTH ); int dst_pix_fmt_inv = convert_csp_to_pix_fmt( h->dst_csp ^ X264_CSP_HIGH_DEPTH ); FAIL_IF_ERROR( h->dst.width <= 0 || h->dst.height <= 0 || h->dst.width > MAX_RESOLUTION || h->dst.height > MAX_RESOLUTION, "invalid width x height (%dx%d)\n", h->dst.width, h->dst.height ); /* confirm swscale can support this conversion */ FAIL_IF_ERROR( src_pix_fmt == AV_PIX_FMT_NONE && src_pix_fmt_inv != AV_PIX_FMT_NONE, "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( src_pix_fmt_inv ), info->csp & 
X264_CSP_HIGH_DEPTH ? 16 : 8 ); FAIL_IF_ERROR( !sws_isSupportedInput( src_pix_fmt ), "input colorspace %s is not supported\n", av_get_pix_fmt_name( src_pix_fmt ) ); FAIL_IF_ERROR( h->dst.pix_fmt == AV_PIX_FMT_NONE && dst_pix_fmt_inv != AV_PIX_FMT_NONE, "input colorspace %s with bit depth %d is not supported\n", av_get_pix_fmt_name( dst_pix_fmt_inv ), h->dst_csp & X264_CSP_HIGH_DEPTH ? 16 : 8 ); FAIL_IF_ERROR( !sws_isSupportedOutput( h->dst.pix_fmt ), "output colorspace %s is not supported\n", av_get_pix_fmt_name( h->dst.pix_fmt ) ); FAIL_IF_ERROR( h->dst.height != info->height && info->interlaced, "swscale is not compatible with interlaced vertical resizing\n" ); /* confirm that the desired resolution meets the colorspace requirements */ const x264_cli_csp_t *csp = x264_cli_get_csp( h->dst_csp ); FAIL_IF_ERROR( h->dst.width % csp->mod_width || h->dst.height % csp->mod_height, "resolution %dx%d is not compliant with colorspace %s\n", h->dst.width, h->dst.height, csp->name ); if( h->dst.width != info->width || h->dst.height != info->height ) x264_cli_log( NAME, X264_LOG_INFO, "resizing to %dx%d\n", h->dst.width, h->dst.height ); if( h->dst.pix_fmt != src_pix_fmt ) x264_cli_log( NAME, X264_LOG_WARNING, "converting from %s to %s\n", av_get_pix_fmt_name( src_pix_fmt ), av_get_pix_fmt_name( h->dst.pix_fmt ) ); else if( h->dst.range != h->input_range ) x264_cli_log( NAME, X264_LOG_WARNING, "converting range from %s to %s\n", h->input_range ? "PC" : "TV", h->dst.range ? "PC" : "TV" ); h->dst_csp |= info->csp & X264_CSP_VFLIP; // preserve vflip if( dst_csp == X264_CSP_I400 && ((src_csp >= X264_CSP_I420 && src_csp <= X264_CSP_NV16) || src_csp == X264_CSP_I444 || src_csp == X264_CSP_YV24) && h->dst.width == info->width && h->dst.height == info->height && h->dst.range == h->input_range ) h->fast_mono = 1; /* use the input luma plane as is */ /* if the input is not variable, initialize the context */ if( !h->variable_input ) { cli_pic_t input_pic = {{info->csp, info->width, info->height, 0}, 0}; if( check_resizer( h, &input_pic ) ) return -1; } /* finished initing, overwrite values */ info->csp = h->dst_csp; info->width = h->dst.width; info->height = h->dst.height; info->fullrange = h->dst.range; h->prev_filter = *filter; h->prev_hnd = *handle; *handle = h; *filter = resize_filter; return 0; } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { resizer_hnd_t *h = handle; if( h->prev_filter.get_frame( h->prev_hnd, output, frame ) ) return -1; if( h->variable_input && check_resizer( h, output ) ) return -1; h->working = 1; if( h->pre_swap_chroma ) XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] ); if( h->ctx && !h->fast_mono ) { sws_scale( h->ctx, (const uint8_t* const*)output->img.plane, output->img.stride, 0, output->img.height, h->buffer.img.plane, h->buffer.img.stride ); output->img = h->buffer.img; /* copy img data */ } else output->img.csp = h->dst_csp; if( h->post_swap_chroma ) XCHG( uint8_t*, output->img.plane[1], output->img.plane[2] ); return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { resizer_hnd_t *h = handle; return h->prev_filter.release_frame( h->prev_hnd, pic, frame ); } static void free_filter( hnd_t handle ) { resizer_hnd_t *h = handle; h->prev_filter.free( h->prev_hnd ); if( h->ctx ) sws_freeContext( h->ctx ); if( h->buffer_allocated ) x264_cli_pic_clean( &h->buffer ); free( h ); } #else /* no swscale */ static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { int 
ret = 0; if( !opt_string ) ret = full_check( info, param ); else { if( !strcmp( opt_string, "normcsp" ) ) ret = info->csp & X264_CSP_OTHER; else ret = -1; } /* pass if nothing needs to be done, otherwise fail */ FAIL_IF_ERROR( ret, "not compiled with swscale support\n" ); return 0; } #define help NULL #define get_frame NULL #define release_frame NULL #define free_filter NULL #define convert_csp_to_pix_fmt(x) (x & X264_CSP_MASK) #endif cli_vid_filter_t resize_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/select_every.c000066400000000000000000000124521502133446700204760ustar00rootroot00000000000000/***************************************************************************** * select_every.c: select-every video filter ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "video.h" #define NAME "select_every" #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, NAME, __VA_ARGS__ ) #define MAX_PATTERN_SIZE 100 /* arbitrary */ typedef struct { hnd_t prev_hnd; cli_vid_filter_t prev_filter; int *pattern; int pattern_len; int step_size; int vfr; int64_t pts; } selvry_hnd_t; cli_vid_filter_t select_every_filter; static void help( int longhelp ) { printf( " "NAME":step,offset1[,...]\n" ); if( !longhelp ) return; printf( " apply a selection pattern to input frames\n" " step: the number of frames in the pattern\n" " offsets: the offset into the step to select a frame\n" " see: http://avisynth.nl/index.php/Select#SelectEvery\n" ); } static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { selvry_hnd_t *h = malloc( sizeof(selvry_hnd_t) ); if( !h ) return -1; h->pattern_len = 0; h->step_size = 0; int offsets[MAX_PATTERN_SIZE]; for( char *tok, *p = opt_string, UNUSED *saveptr = NULL; (tok = strtok_r( p, ",", &saveptr )); p = NULL ) { int val = x264_otoi( tok, -1 ); if( p ) { FAIL_IF_ERROR( val <= 0, "invalid step `%s'\n", tok ); h->step_size = val; continue; } FAIL_IF_ERROR( val < 0 || val >= h->step_size, "invalid offset `%s'\n", tok ); FAIL_IF_ERROR( h->pattern_len >= MAX_PATTERN_SIZE, "max pattern size %d reached\n", MAX_PATTERN_SIZE ); offsets[h->pattern_len++] = val; } FAIL_IF_ERROR( !h->step_size, "no step size provided\n" ); FAIL_IF_ERROR( !h->pattern_len, "no offsets supplied\n" ); h->pattern = malloc( h->pattern_len * sizeof(int) ); if( !h->pattern ) return -1; memcpy( h->pattern, offsets, h->pattern_len * sizeof(int) ); /* determine required cache size to maintain pattern. 
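     * (Editor's note, worked example only, not part of the original file:
     * get_frame() below maps output frame f to source frame
     *     pattern[f % pattern_len] + (f / pattern_len) * step_size,
     * so step=5 with offsets "3,0" selects source frames 3,0,8,5,13,10,...
     * The loop below finds the largest backward jump within one period
     * (here 3-0+1 = 4) and chains a cache filter of that many frames;
     * when the offsets are given in increasing order the bound stays 0 and
     * the cache filter is skipped.)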
*/ intptr_t max_rewind = 0; int min = h->step_size; for( int i = h->pattern_len-1; i >= 0; i-- ) { min = X264_MIN( min, offsets[i] ); if( i ) max_rewind = X264_MAX( max_rewind, offsets[i-1] - min + 1 ); /* reached maximum rewind size */ if( max_rewind == h->step_size ) break; } char name[20]; sprintf( name, "cache_%d", param->i_bitdepth ); if( x264_init_vid_filter( name, handle, filter, info, param, (void*)max_rewind ) ) return -1; /* done initing, overwrite properties */ if( h->step_size != h->pattern_len ) { info->num_frames = (uint64_t)info->num_frames * h->pattern_len / h->step_size; info->fps_den *= h->step_size; info->fps_num *= h->pattern_len; x264_reduce_fraction( &info->fps_num, &info->fps_den ); if( info->vfr ) { info->timebase_den *= h->pattern_len; info->timebase_num *= h->step_size; x264_reduce_fraction( &info->timebase_num, &info->timebase_den ); } } h->pts = 0; h->vfr = info->vfr; h->prev_filter = *filter; h->prev_hnd = *handle; *filter = select_every_filter; *handle = h; return 0; } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { selvry_hnd_t *h = handle; int pat_frame = h->pattern[frame % h->pattern_len] + frame / h->pattern_len * h->step_size; if( h->prev_filter.get_frame( h->prev_hnd, output, pat_frame ) ) return -1; if( h->vfr ) { output->pts = h->pts; h->pts += output->duration; } return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { selvry_hnd_t *h = handle; int pat_frame = h->pattern[frame % h->pattern_len] + frame / h->pattern_len * h->step_size; return h->prev_filter.release_frame( h->prev_hnd, pic, pat_frame ); } static void free_filter( hnd_t handle ) { selvry_hnd_t *h = handle; h->prev_filter.free( h->prev_hnd ); free( h->pattern ); free( h ); } cli_vid_filter_t select_every_filter = { NAME, help, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/source.c000066400000000000000000000053421502133446700173050ustar00rootroot00000000000000/***************************************************************************** * source.c: source video filter ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "video.h" /* This filter converts the demuxer API into the filtering API for video frames. * Backseeking is prohibited here as not all demuxers are capable of doing so. 
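 * As a consequence, frames must be requested in strictly increasing order;
 * get_frame() below rejects any request at or before the most recently read frame.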
*/ typedef struct { cli_pic_t pic; hnd_t hin; int cur_frame; } source_hnd_t; cli_vid_filter_t source_filter; static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { source_hnd_t *h = calloc( 1, sizeof(source_hnd_t) ); if( !h ) return -1; h->cur_frame = -1; if( cli_input.picture_alloc( &h->pic, *handle, info->csp, info->width, info->height ) ) return -1; h->hin = *handle; *handle = h; *filter = source_filter; return 0; } static int get_frame( hnd_t handle, cli_pic_t *output, int frame ) { source_hnd_t *h = handle; /* do not allow requesting of frames from before the current position */ if( frame <= h->cur_frame || cli_input.read_frame( &h->pic, h->hin, frame ) ) return -1; h->cur_frame = frame; *output = h->pic; return 0; } static int release_frame( hnd_t handle, cli_pic_t *pic, int frame ) { source_hnd_t *h = handle; if( cli_input.release_frame && cli_input.release_frame( &h->pic, h->hin ) ) return -1; return 0; } static void free_filter( hnd_t handle ) { source_hnd_t *h = handle; cli_input.picture_clean( &h->pic, h->hin ); cli_input.close_file( h->hin ); free( h ); } cli_vid_filter_t source_filter = { "source", NULL, init, get_frame, release_frame, free_filter, NULL }; x264-master/filters/video/video.c000066400000000000000000000053621502133446700171150ustar00rootroot00000000000000/***************************************************************************** * video.c: video filters ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "video.h" static cli_vid_filter_t *first_filter = NULL; static void register_vid_filter( cli_vid_filter_t *new_filter ) { cli_vid_filter_t *filter_i = first_filter; while( filter_i->next ) filter_i = filter_i->next; filter_i->next = new_filter; new_filter->next = NULL; } #define REGISTER_VFILTER(name)\ {\ extern cli_vid_filter_t name##_filter;\ register_vid_filter( &name##_filter );\ } void x264_register_vid_filters( void ) { extern cli_vid_filter_t source_filter; first_filter = &source_filter; #if HAVE_BITDEPTH8 REGISTER_VFILTER( cache_8 ); REGISTER_VFILTER( depth_8 ); #endif #if HAVE_BITDEPTH10 REGISTER_VFILTER( cache_10 ); REGISTER_VFILTER( depth_10 ); #endif REGISTER_VFILTER( crop ); REGISTER_VFILTER( fix_vfr_pts ); REGISTER_VFILTER( resize ); REGISTER_VFILTER( select_every ); #if HAVE_GPL #endif } int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ) { cli_vid_filter_t *filter_i = first_filter; while( filter_i && strcasecmp( name, filter_i->name ) ) filter_i = filter_i->next; FAIL_IF_ERR( !filter_i, "x264", "invalid filter `%s'\n", name ); if( filter_i->init( handle, filter, info, param, opt_string ) ) return -1; return 0; } void x264_vid_filter_help( int longhelp ) { for( cli_vid_filter_t *filter_i = first_filter; filter_i; filter_i = filter_i->next ) if( filter_i->help ) filter_i->help( longhelp ); } x264-master/filters/video/video.h000066400000000000000000000055631502133446700171250ustar00rootroot00000000000000/***************************************************************************** * video.h: video filters ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifndef X264_FILTER_VIDEO_H #define X264_FILTER_VIDEO_H #include "input/input.h" #include "filters/filters.h" typedef struct cli_vid_filter_t cli_vid_filter_t; struct cli_vid_filter_t { /* name of the filter */ const char *name; /* help: a short message on what the filter does and how to use it. * this should only be implemented by filters directly accessible by the user */ void (*help)( int longhelp ); /* init: initializes the filter given the input clip properties and parameter to adjust them as necessary * with the given options provided by the user. * returns 0 on success, nonzero on error. 
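 * note: downstream filters (see e.g. resize.c and select_every.c) chain themselves
 * here by saving the incoming *handle/*filter pair as their prev_hnd/prev_filter
 * and then overwriting both with their own handle and filter struct before returning.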
*/ int (*init)( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ); /* get_frame: given the storage for the output frame and desired frame number, generate the frame accordingly. * the image data returned by get_frame should be treated as const and not be altered. * returns 0 on success, nonzero on error. */ int (*get_frame)( hnd_t handle, cli_pic_t *output, int frame ); /* release_frame: frame is done being used and is signaled for cleanup. * returns 0 on succeess, nonzero on error. */ int (*release_frame)( hnd_t handle, cli_pic_t *pic, int frame ); /* free: run filter cleanup procedures. */ void (*free)( hnd_t handle ); /* next registered filter, unused by filters themselves */ cli_vid_filter_t *next; }; void x264_register_vid_filters( void ); void x264_vid_filter_help( int longhelp ); int x264_init_vid_filter( const char *name, hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x264_param_t *param, char *opt_string ); #endif x264-master/input/000077500000000000000000000000001502133446700142165ustar00rootroot00000000000000x264-master/input/avs.c000066400000000000000000000550301502133446700151560ustar00rootroot00000000000000/***************************************************************************** * avs.c: avisynth input ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Steven Walters * Anton Mitrofanov * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "input.h" #if SYS_WINDOWS || SYS_CYGWIN #include #define avs_open() LoadLibraryW( L"avisynth" ) #define avs_close FreeLibrary #define avs_address GetProcAddress #else #include #if SYS_MACOSX #define avs_open() dlopen( "libavisynth.dylib", RTLD_NOW ) #else #define avs_open() dlopen( "libavisynth.so", RTLD_NOW ) #endif #define avs_close dlclose #define avs_address dlsym #endif #define AVSC_NO_DECLSPEC #undef EXTERN_C #include "extras/avisynth_c.h" #define AVSC_DECLARE_FUNC(name) name##_func name #define FAIL_IF_ERROR( cond, ... 
) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ ) /* AVS uses a versioned interface to control backwards compatibility */ /* YV12 support is required, which was added in 2.5 */ #define AVS_INTERFACE_25 2 #if HAVE_SWSCALE #include #endif /* maximum size of the sequence of filters to try on non script files */ #define AVS_MAX_SEQUENCE 5 #define LOAD_AVS_FUNC(name, continue_on_fail)\ {\ h->func.name = (void*)avs_address( h->library, #name );\ if( !continue_on_fail && !h->func.name )\ goto fail;\ } #define LOAD_AVS_FUNC_ALIAS(name, alias, continue_on_fail)\ {\ if( !h->func.name )\ h->func.name = (void*)avs_address( h->library, alias );\ if( !continue_on_fail && !h->func.name )\ goto fail;\ } typedef struct { AVS_Clip *clip; AVS_ScriptEnvironment *env; void *library; int num_frames; struct { AVSC_DECLARE_FUNC( avs_clip_get_error ); AVSC_DECLARE_FUNC( avs_create_script_environment ); AVSC_DECLARE_FUNC( avs_delete_script_environment ); AVSC_DECLARE_FUNC( avs_get_error ); AVSC_DECLARE_FUNC( avs_get_frame ); AVSC_DECLARE_FUNC( avs_get_video_info ); AVSC_DECLARE_FUNC( avs_function_exists ); AVSC_DECLARE_FUNC( avs_invoke ); AVSC_DECLARE_FUNC( avs_release_clip ); AVSC_DECLARE_FUNC( avs_release_value ); AVSC_DECLARE_FUNC( avs_release_video_frame ); AVSC_DECLARE_FUNC( avs_take_clip ); AVSC_DECLARE_FUNC( avs_is_yv24 ); AVSC_DECLARE_FUNC( avs_is_yv16 ); AVSC_DECLARE_FUNC( avs_is_yv12 ); AVSC_DECLARE_FUNC( avs_is_yv411 ); AVSC_DECLARE_FUNC( avs_is_y8 ); AVSC_DECLARE_FUNC( avs_get_pitch_p ); AVSC_DECLARE_FUNC( avs_get_read_ptr_p ); // AviSynth+ extension AVSC_DECLARE_FUNC( avs_is_rgb48 ); AVSC_DECLARE_FUNC( avs_is_rgb64 ); AVSC_DECLARE_FUNC( avs_is_yuv444p16 ); AVSC_DECLARE_FUNC( avs_is_yuv422p16 ); AVSC_DECLARE_FUNC( avs_is_yuv420p16 ); AVSC_DECLARE_FUNC( avs_is_y16 ); AVSC_DECLARE_FUNC( avs_is_444 ); AVSC_DECLARE_FUNC( avs_is_422 ); AVSC_DECLARE_FUNC( avs_is_420 ); AVSC_DECLARE_FUNC( avs_is_y ); } func; } avs_hnd_t; /* load the library and functions we require from it */ static int custom_avs_load_library( avs_hnd_t *h ) { h->library = avs_open(); if( !h->library ) return -1; LOAD_AVS_FUNC( avs_clip_get_error, 0 ); LOAD_AVS_FUNC( avs_create_script_environment, 0 ); LOAD_AVS_FUNC( avs_delete_script_environment, 1 ); LOAD_AVS_FUNC( avs_get_error, 1 ); LOAD_AVS_FUNC( avs_get_frame, 0 ); LOAD_AVS_FUNC( avs_get_video_info, 0 ); LOAD_AVS_FUNC( avs_function_exists, 0 ); LOAD_AVS_FUNC( avs_invoke, 0 ); LOAD_AVS_FUNC( avs_release_clip, 0 ); LOAD_AVS_FUNC( avs_release_value, 0 ); LOAD_AVS_FUNC( avs_release_video_frame, 0 ); LOAD_AVS_FUNC( avs_take_clip, 0 ); LOAD_AVS_FUNC( avs_is_yv24, 1 ); LOAD_AVS_FUNC( avs_is_yv16, 1 ); LOAD_AVS_FUNC( avs_is_yv12, 1 ); LOAD_AVS_FUNC( avs_is_yv411, 1 ); LOAD_AVS_FUNC( avs_is_y8, 1 ); LOAD_AVS_FUNC( avs_get_pitch_p, 1 ); LOAD_AVS_FUNC( avs_get_read_ptr_p, 1 ); // AviSynth+ extension LOAD_AVS_FUNC( avs_is_rgb48, 1 ); LOAD_AVS_FUNC_ALIAS( avs_is_rgb48, "_avs_is_rgb48@4", 1 ); LOAD_AVS_FUNC( avs_is_rgb64, 1 ); LOAD_AVS_FUNC_ALIAS( avs_is_rgb64, "_avs_is_rgb64@4", 1 ); LOAD_AVS_FUNC( avs_is_yuv444p16, 1 ); LOAD_AVS_FUNC( avs_is_yuv422p16, 1 ); LOAD_AVS_FUNC( avs_is_yuv420p16, 1 ); LOAD_AVS_FUNC( avs_is_y16, 1 ); LOAD_AVS_FUNC( avs_is_444, 1 ); LOAD_AVS_FUNC( avs_is_422, 1 ); LOAD_AVS_FUNC( avs_is_420, 1 ); LOAD_AVS_FUNC( avs_is_y, 1 ); return 0; fail: avs_close( h->library ); h->library = NULL; return -1; } #define AVS_IS_YV24( vi ) (h->func.avs_is_yv24 ? h->func.avs_is_yv24( vi ) : avs_is_yv24( vi )) #define AVS_IS_YV16( vi ) (h->func.avs_is_yv16 ? 
h->func.avs_is_yv16( vi ) : avs_is_yv16( vi )) #define AVS_IS_YV12( vi ) (h->func.avs_is_yv12 ? h->func.avs_is_yv12( vi ) : avs_is_yv12( vi )) #define AVS_IS_YV411( vi ) (h->func.avs_is_yv411 ? h->func.avs_is_yv411( vi ) : avs_is_yv411( vi )) #define AVS_IS_Y8( vi ) (h->func.avs_is_y8 ? h->func.avs_is_y8( vi ) : avs_is_y8( vi )) #define AVS_GET_PITCH_P( p, plane ) (h->func.avs_get_pitch_p ? h->func.avs_get_pitch_p( p, plane ) : avs_get_pitch_p( p, plane )) #define AVS_GET_READ_PTR_P( p, plane ) (h->func.avs_get_read_ptr_p ? h->func.avs_get_read_ptr_p( p, plane ) : avs_get_read_ptr_p( p, plane )) #define AVS_IS_AVISYNTHPLUS (h->func.avs_is_420 && h->func.avs_is_422 && h->func.avs_is_444) #define AVS_IS_420( vi ) (h->func.avs_is_420 ? h->func.avs_is_420( vi ) : AVS_IS_YV12( vi )) #define AVS_IS_422( vi ) (h->func.avs_is_422 ? h->func.avs_is_422( vi ) : AVS_IS_YV16( vi )) #define AVS_IS_444( vi ) (h->func.avs_is_444 ? h->func.avs_is_444( vi ) : AVS_IS_YV24( vi )) #define AVS_IS_RGB48( vi ) (h->func.avs_is_rgb48 && h->func.avs_is_rgb48( vi )) #define AVS_IS_RGB64( vi ) (h->func.avs_is_rgb64 && h->func.avs_is_rgb64( vi )) #define AVS_IS_YUV420P16( vi ) (h->func.avs_is_yuv420p16 && h->func.avs_is_yuv420p16( vi )) #define AVS_IS_YUV422P16( vi ) (h->func.avs_is_yuv422p16 && h->func.avs_is_yuv422p16( vi )) #define AVS_IS_YUV444P16( vi ) (h->func.avs_is_yuv444p16 && h->func.avs_is_yuv444p16( vi )) #define AVS_IS_Y( vi ) (h->func.avs_is_y ? h->func.avs_is_y( vi ) : AVS_IS_Y8( vi )) #define AVS_IS_Y16( vi ) (h->func.avs_is_y16 && h->func.avs_is_y16( vi )) /* generate a filter sequence to try based on the filename extension */ static void avs_build_filter_sequence( char *filename_ext, const char *filter[AVS_MAX_SEQUENCE+1] ) { int i = 0; #if SYS_WINDOWS || SYS_CYGWIN const char *all_purpose[] = { "FFmpegSource2", "DSS2", "DirectShowSource", 0 }; if( !strcasecmp( filename_ext, "avi" ) ) filter[i++] = "AVISource"; if( !strcasecmp( filename_ext, "d2v" ) ) filter[i++] = "MPEG2Source"; if( !strcasecmp( filename_ext, "dga" ) ) filter[i++] = "AVCSource"; #else const char *all_purpose[] = { "FFVideoSource", 0 }; #endif for( int j = 0; all_purpose[j] && i < AVS_MAX_SEQUENCE; j++ ) filter[i++] = all_purpose[j]; } static AVS_Value update_clip( avs_hnd_t *h, const AVS_VideoInfo **vi, AVS_Value res, AVS_Value release ) { h->func.avs_release_clip( h->clip ); h->clip = h->func.avs_take_clip( res, h->env ); h->func.avs_release_value( release ); *vi = h->func.avs_get_video_info( h->clip ); return res; } static float get_avs_version( avs_hnd_t *h ) { FAIL_IF_ERROR( !h->func.avs_function_exists( h->env, "VersionNumber" ), "VersionNumber does not exist\n" ); AVS_Value ver = h->func.avs_invoke( h->env, "VersionNumber", avs_new_value_array( NULL, 0 ), NULL ); FAIL_IF_ERROR( avs_is_error( ver ), "unable to determine avisynth version: %s\n", avs_as_error( ver ) ); FAIL_IF_ERROR( !avs_is_float( ver ), "VersionNumber did not return a float value\n" ); float ret = avs_as_float( ver ); h->func.avs_release_value( ver ); return ret; } #ifdef _WIN32 static char *utf16_to_ansi( const wchar_t *utf16 ) { BOOL invalid; int len = WideCharToMultiByte( CP_ACP, WC_NO_BEST_FIT_CHARS, utf16, -1, NULL, 0, NULL, &invalid ); if( len && !invalid ) { char *ansi = malloc( len * sizeof( char ) ); if( ansi ) { if( WideCharToMultiByte( CP_ACP, WC_NO_BEST_FIT_CHARS, utf16, -1, ansi, len, NULL, &invalid ) && !invalid ) return ansi; free( ansi ); } } return NULL; } static char *utf8_to_ansi( const char *filename ) { char *ansi = NULL; wchar_t 
*filename_utf16 = x264_utf8_to_utf16( filename ); if( filename_utf16 ) { /* Check if the filename already is valid ANSI. */ if( !(ansi = utf16_to_ansi( filename_utf16 )) ) { /* Check for a legacy 8.3 short filename. */ int len = GetShortPathNameW( filename_utf16, NULL, 0 ); if( len ) { wchar_t *short_utf16 = malloc( len * sizeof( wchar_t ) ); if( short_utf16 ) { if( GetShortPathNameW( filename_utf16, short_utf16, len ) ) ansi = utf16_to_ansi( short_utf16 ); free( short_utf16 ); } } } free( filename_utf16 ); } return ansi; } #endif static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { FILE *fh = x264_fopen( psz_filename, "r" ); if( !fh ) return -1; int b_regular = x264_is_regular_file( fh ); fclose( fh ); FAIL_IF_ERROR( !b_regular, "AVS input is incompatible with non-regular file `%s'\n", psz_filename ); avs_hnd_t *h = calloc( 1, sizeof(avs_hnd_t) ); if( !h ) return -1; FAIL_IF_ERROR( custom_avs_load_library( h ), "failed to load avisynth\n" ); h->env = h->func.avs_create_script_environment( AVS_INTERFACE_25 ); if( h->func.avs_get_error ) { const char *error = h->func.avs_get_error( h->env ); FAIL_IF_ERROR( error, "%s\n", error ); } float avs_version = get_avs_version( h ); if( avs_version <= 0 ) return -1; x264_cli_log( "avs", X264_LOG_DEBUG, "using avisynth version %.2f\n", avs_version ); #ifdef _WIN32 /* Avisynth doesn't support Unicode filenames. */ char *ansi_filename = utf8_to_ansi( psz_filename ); FAIL_IF_ERROR( !ansi_filename, "invalid ansi filename\n" ); AVS_Value arg = avs_new_value_string( ansi_filename ); #else AVS_Value arg = avs_new_value_string( psz_filename ); #endif AVS_Value res; char *filename_ext = get_filename_extension( psz_filename ); if( !strcasecmp( filename_ext, "avs" ) ) { res = h->func.avs_invoke( h->env, "Import", arg, NULL ); #ifdef _WIN32 free( ansi_filename ); #endif FAIL_IF_ERROR( avs_is_error( res ), "%s\n", avs_as_error( res ) ); /* check if the user is using a multi-threaded script and apply distributor if necessary. adapted from avisynth's vfw interface */ AVS_Value mt_test = h->func.avs_invoke( h->env, "GetMTMode", avs_new_value_bool( 0 ), NULL ); int mt_mode = avs_is_int( mt_test ) ? avs_as_int( mt_test ) : 0; h->func.avs_release_value( mt_test ); if( mt_mode > 0 && mt_mode < 5 ) { AVS_Value temp = h->func.avs_invoke( h->env, "Distributor", res, NULL ); h->func.avs_release_value( res ); res = temp; } } else /* non script file */ { /* cycle through known source filters to find one that works */ const char *filter[AVS_MAX_SEQUENCE+1] = { 0 }; avs_build_filter_sequence( filename_ext, filter ); int i; for( i = 0; filter[i]; i++ ) { x264_cli_log( "avs", X264_LOG_INFO, "trying %s... ", filter[i] ); if( !h->func.avs_function_exists( h->env, filter[i] ) ) { x264_cli_printf( X264_LOG_INFO, "not found\n" ); continue; } if( !strncasecmp( filter[i], "FFmpegSource", 12 ) ) { x264_cli_printf( X264_LOG_INFO, "indexing... 
" ); fflush( stderr ); } res = h->func.avs_invoke( h->env, filter[i], arg, NULL ); if( !avs_is_error( res ) ) { x264_cli_printf( X264_LOG_INFO, "succeeded\n" ); break; } x264_cli_printf( X264_LOG_INFO, "failed\n" ); } #ifdef _WIN32 free( ansi_filename ); #endif FAIL_IF_ERROR( !filter[i], "unable to find source filter to open `%s'\n", psz_filename ); } FAIL_IF_ERROR( !avs_is_clip( res ), "`%s' didn't return a video clip\n", psz_filename ); h->clip = h->func.avs_take_clip( res, h->env ); const AVS_VideoInfo *vi = h->func.avs_get_video_info( h->clip ); FAIL_IF_ERROR( !avs_has_video( vi ), "`%s' has no video data\n", psz_filename ); /* if the clip is made of fields instead of frames, call weave to make them frames */ if( avs_is_field_based( vi ) ) { x264_cli_log( "avs", X264_LOG_WARNING, "detected fieldbased (separated) input, weaving to frames\n" ); AVS_Value tmp = h->func.avs_invoke( h->env, "Weave", res, NULL ); FAIL_IF_ERROR( avs_is_error( tmp ), "couldn't weave fields into frames: %s\n", avs_as_error( tmp ) ); res = update_clip( h, &vi, tmp, res ); info->interlaced = 1; info->tff = avs_is_tff( vi ); } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I400 || opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), "avisynth >= 2.6 is required for i400/i422/i444 output\n" ); if( (opt->output_csp == X264_CSP_I400 && !AVS_IS_Y( vi )) || (opt->output_csp == X264_CSP_I420 && !AVS_IS_420( vi )) || (opt->output_csp == X264_CSP_I422 && !AVS_IS_422( vi )) || (opt->output_csp == X264_CSP_I444 && !AVS_IS_444( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { const char *csp; if( AVS_IS_AVISYNTHPLUS ) { csp = opt->output_csp == X264_CSP_I400 ? "Y" : opt->output_csp == X264_CSP_I420 ? "YUV420" : opt->output_csp == X264_CSP_I422 ? "YUV422" : opt->output_csp == X264_CSP_I444 ? "YUV444" : "RGB"; } else { csp = opt->output_csp == X264_CSP_I400 ? "Y8" : opt->output_csp == X264_CSP_I420 ? "YV12" : opt->output_csp == X264_CSP_I422 ? "YV16" : opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB"; } x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp ); if( opt->output_csp != X264_CSP_I400 ) { FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1), "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height ); FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3), "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height ); FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1), "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height ); } char conv_func[16]; snprintf( conv_func, sizeof(conv_func), "ConvertTo%s", csp ); AVS_Value arg_arr[3]; const char *arg_name[3]; int arg_count = 1; arg_arr[0] = res; arg_name[0] = NULL; if( opt->output_csp != X264_CSP_I400 ) { arg_arr[arg_count] = avs_new_value_bool( info->interlaced ); arg_name[arg_count] = "interlaced"; arg_count++; } /* if doing a rgb <-> yuv conversion then range is handled via 'matrix'. though it's only supported in 2.56+ */ char matrix[7]; if( avs_version >= 2.56f && ((opt->output_csp == X264_CSP_RGB && avs_is_yuv( vi )) || (opt->output_csp != X264_CSP_RGB && avs_is_rgb( vi ))) ) { // if converting from yuv, then we specify the matrix for the input, otherwise use the output's. int use_pc_matrix = avs_is_yuv( vi ) ? 
opt->input_range == RANGE_PC : opt->output_range == RANGE_PC; snprintf( matrix, sizeof(matrix), "%s601", use_pc_matrix ? "PC." : "Rec" ); /* FIXME: use correct coefficients */ arg_arr[arg_count] = avs_new_value_string( matrix ); arg_name[arg_count] = "matrix"; arg_count++; // notification that the input range has changed to the desired one opt->input_range = opt->output_range; } AVS_Value res2 = h->func.avs_invoke( h->env, conv_func, avs_new_value_array( arg_arr, arg_count ), arg_name ); FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to %s: %s\n", csp, avs_as_error( res2 ) ); res = update_clip( h, &vi, res2, res ); } /* if swscale is not available, change the range if necessary. This only applies to YUV-based CSPs however */ if( avs_is_yuv( vi ) && opt->output_range != RANGE_AUTO && ((opt->input_range == RANGE_PC) != opt->output_range) ) { const char *levels = opt->output_range ? "TV->PC" : "PC->TV"; x264_cli_log( "avs", X264_LOG_WARNING, "performing %s conversion\n", levels ); AVS_Value arg_arr[2]; arg_arr[0] = res; arg_arr[1] = avs_new_value_string( levels ); const char *arg_name[] = { NULL, "levels" }; AVS_Value res2 = h->func.avs_invoke( h->env, "ColorYUV", avs_new_value_array( arg_arr, 2 ), arg_name ); FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert range: %s\n", avs_as_error( res2 ) ); res = update_clip( h, &vi, res2, res ); // notification that the input range has changed to the desired one opt->input_range = opt->output_range; } #endif h->func.avs_release_value( res ); info->width = vi->width; info->height = vi->height; info->fps_num = vi->fps_numerator; info->fps_den = vi->fps_denominator; h->num_frames = info->num_frames = vi->num_frames; info->thread_safe = 1; if( AVS_IS_RGB64( vi ) ) info->csp = X264_CSP_BGRA | X264_CSP_VFLIP | X264_CSP_HIGH_DEPTH; else if( avs_is_rgb32( vi ) ) info->csp = X264_CSP_BGRA | X264_CSP_VFLIP; else if( AVS_IS_RGB48( vi ) ) info->csp = X264_CSP_BGR | X264_CSP_VFLIP | X264_CSP_HIGH_DEPTH; else if( avs_is_rgb24( vi ) ) info->csp = X264_CSP_BGR | X264_CSP_VFLIP; else if( AVS_IS_YUV444P16( vi ) ) info->csp = X264_CSP_I444 | X264_CSP_HIGH_DEPTH; else if( AVS_IS_YV24( vi ) ) info->csp = X264_CSP_I444; else if( AVS_IS_YUV422P16( vi ) ) info->csp = X264_CSP_I422 | X264_CSP_HIGH_DEPTH; else if( AVS_IS_YV16( vi ) ) info->csp = X264_CSP_I422; else if( AVS_IS_YUV420P16( vi ) ) info->csp = X264_CSP_I420 | X264_CSP_HIGH_DEPTH; else if( AVS_IS_YV12( vi ) ) info->csp = X264_CSP_I420; else if( AVS_IS_Y16( vi ) ) info->csp = X264_CSP_I400 | X264_CSP_HIGH_DEPTH; else if( AVS_IS_Y8( vi ) ) info->csp = X264_CSP_I400; else if( avs_is_yuy2( vi ) ) info->csp = X264_CSP_YUYV; #if HAVE_SWSCALE else if( AVS_IS_YV411( vi ) ) info->csp = AV_PIX_FMT_YUV411P | X264_CSP_OTHER; #endif else { AVS_Value pixel_type = h->func.avs_invoke( h->env, "PixelType", res, NULL ); const char *pixel_type_name = avs_is_string( pixel_type ) ? 
avs_as_string( pixel_type ) : "unknown"; FAIL_IF_ERROR( 1, "not supported pixel type: %s\n", pixel_type_name ); } info->vfr = 0; *p_handle = h; return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) ) return -1; pic->img.csp = csp; const x264_cli_csp_t *cli_csp = x264_cli_get_csp( csp ); if( cli_csp ) pic->img.planes = cli_csp->planes; #if HAVE_SWSCALE else if( csp == (AV_PIX_FMT_YUV411P | X264_CSP_OTHER) ) pic->img.planes = 3; else pic->img.planes = 1; //y8 and yuy2 are one plane #endif return 0; } static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame ) { static const int plane[3] = { AVS_PLANAR_Y, AVS_PLANAR_U, AVS_PLANAR_V }; avs_hnd_t *h = handle; if( i_frame >= h->num_frames ) return -1; AVS_VideoFrame *frm = pic->opaque = h->func.avs_get_frame( h->clip, i_frame ); const char *err = h->func.avs_clip_get_error( h->clip ); FAIL_IF_ERROR( err, "%s occurred while reading frame %d\n", err, i_frame ); for( int i = 0; i < pic->img.planes; i++ ) { /* explicitly cast away the const attribute to avoid a warning */ pic->img.plane[i] = (uint8_t*)AVS_GET_READ_PTR_P( frm, plane[i] ); pic->img.stride[i] = AVS_GET_PITCH_P( frm, plane[i] ); } return 0; } static int release_frame( cli_pic_t *pic, hnd_t handle ) { avs_hnd_t *h = handle; h->func.avs_release_video_frame( pic->opaque ); return 0; } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { memset( pic, 0, sizeof(cli_pic_t) ); } static int close_file( hnd_t handle ) { avs_hnd_t *h = handle; if( h->func.avs_release_clip && h->clip ) h->func.avs_release_clip( h->clip ); if( h->func.avs_delete_script_environment && h->env ) h->func.avs_delete_script_environment( h->env ); if( h->library ) avs_close( h->library ); free( h ); return 0; } const cli_input_t avs_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file }; x264-master/input/ffms.c000066400000000000000000000173151502133446700153240ustar00rootroot00000000000000/***************************************************************************** * ffms.c: ffmpegsource input ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Mike Gurlitz * Steven Walters * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "input.h" #include #undef DECLARE_ALIGNED #include #include #define FAIL_IF_ERROR( cond, ... 
) FAIL_IF_ERR( cond, "ffms", __VA_ARGS__ ) #define PROGRESS_LENGTH 36 typedef struct { FFMS_VideoSource *video_source; FFMS_Track *track; int reduce_pts; int vfr_input; int num_frames; int64_t time; } ffms_hnd_t; static int FFMS_CC update_progress( int64_t current, int64_t total, void *private ) { int64_t *update_time = private; int64_t oldtime = *update_time; int64_t newtime = x264_mdate(); if( oldtime && newtime - oldtime < UPDATE_INTERVAL ) return 0; *update_time = newtime; char buf[PROGRESS_LENGTH+5+1]; snprintf( buf, sizeof(buf), "ffms [info]: indexing input file [%.1f%%]", 100.0 * current / total ); fprintf( stderr, "%-*s\r", PROGRESS_LENGTH, buf+5 ); x264_cli_set_console_title( buf ); fflush( stderr ); return 0; } /* handle the deprecated jpeg pixel formats */ static int handle_jpeg( int csp, int *fullrange ) { switch( csp ) { case AV_PIX_FMT_YUVJ420P: *fullrange = 1; return AV_PIX_FMT_YUV420P; case AV_PIX_FMT_YUVJ422P: *fullrange = 1; return AV_PIX_FMT_YUV422P; case AV_PIX_FMT_YUVJ444P: *fullrange = 1; return AV_PIX_FMT_YUV444P; default: return csp; } } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { ffms_hnd_t *h = calloc( 1, sizeof(ffms_hnd_t) ); if( !h ) return -1; FFMS_Init( 0, 1 ); FFMS_ErrorInfo e; e.BufferSize = 0; int seekmode = opt->seek ? FFMS_SEEK_NORMAL : FFMS_SEEK_LINEAR_NO_RW; FFMS_Index *idx = NULL; if( opt->index_file ) { x264_struct_stat index_s, input_s; if( !x264_stat( opt->index_file, &index_s ) && !x264_stat( psz_filename, &input_s ) && input_s.st_mtime < index_s.st_mtime ) { idx = FFMS_ReadIndex( opt->index_file, &e ); if( idx && FFMS_IndexBelongsToFile( idx, psz_filename, &e ) ) { FFMS_DestroyIndex( idx ); idx = NULL; } } } if( !idx ) { FFMS_Indexer *indexer = FFMS_CreateIndexer( psz_filename, &e ); FAIL_IF_ERROR( !indexer, "could not create indexer\n" ); if( opt->progress ) FFMS_SetProgressCallback( indexer, update_progress, &h->time ); idx = FFMS_DoIndexing2( indexer, FFMS_IEH_ABORT, &e ); fprintf( stderr, "%*c", PROGRESS_LENGTH+1, '\r' ); FAIL_IF_ERROR( !idx, "could not create index\n" ); if( opt->index_file && FFMS_WriteIndex( opt->index_file, idx, &e ) ) x264_cli_log( "ffms", X264_LOG_WARNING, "could not write index file\n" ); } int trackno = FFMS_GetFirstTrackOfType( idx, FFMS_TYPE_VIDEO, &e ); if( trackno >= 0 ) h->video_source = FFMS_CreateVideoSource( psz_filename, trackno, idx, 1, seekmode, &e ); FFMS_DestroyIndex( idx ); FAIL_IF_ERROR( trackno < 0, "could not find video track\n" ); FAIL_IF_ERROR( !h->video_source, "could not create video source\n" ); const FFMS_VideoProperties *videop = FFMS_GetVideoProperties( h->video_source ); info->num_frames = h->num_frames = videop->NumFrames; info->sar_height = videop->SARDen; info->sar_width = videop->SARNum; info->fps_den = videop->FPSDenominator; info->fps_num = videop->FPSNumerator; h->vfr_input = info->vfr; /* ffms is thread unsafe as it uses a single frame buffer for all frame requests */ info->thread_safe = 0; const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e ); FAIL_IF_ERROR( !frame, "could not read frame 0\n" ); info->fullrange = 0; info->width = frame->EncodedWidth; info->height = frame->EncodedHeight; info->csp = handle_jpeg( frame->EncodedPixelFormat, &info->fullrange ) | X264_CSP_OTHER; info->interlaced = frame->InterlacedFrame; info->tff = frame->TopFieldFirst; info->fullrange |= frame->ColorRange == FFMS_CR_JPEG; /* ffms timestamps are in milliseconds. 
ffms also uses int64_ts for timebase, * so we need to reduce large timebases to prevent overflow */ if( h->vfr_input ) { h->track = FFMS_GetTrackFromVideo( h->video_source ); const FFMS_TrackTimeBase *timebase = FFMS_GetTimeBase( h->track ); int64_t timebase_num = timebase->Num; int64_t timebase_den = timebase->Den * 1000; h->reduce_pts = 0; while( timebase_num > UINT32_MAX || timebase_den > INT32_MAX ) { timebase_num >>= 1; timebase_den >>= 1; h->reduce_pts++; } info->timebase_num = timebase_num; info->timebase_den = timebase_den; } *p_handle = h; return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) ) return -1; pic->img.csp = csp; pic->img.planes = 4; return 0; } static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame ) { ffms_hnd_t *h = handle; if( i_frame >= h->num_frames ) return -1; FFMS_ErrorInfo e; e.BufferSize = 0; const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, i_frame, &e ); FAIL_IF_ERROR( !frame, "could not read frame %d \n", i_frame ); memcpy( pic->img.stride, frame->Linesize, sizeof(pic->img.stride) ); memcpy( pic->img.plane, frame->Data, sizeof(pic->img.plane) ); int is_fullrange = 0; pic->img.width = frame->EncodedWidth; pic->img.height = frame->EncodedHeight; pic->img.csp = handle_jpeg( frame->EncodedPixelFormat, &is_fullrange ) | X264_CSP_OTHER; if( h->vfr_input ) { const FFMS_FrameInfo *info = FFMS_GetFrameInfo( h->track, i_frame ); FAIL_IF_ERROR( info->PTS == AV_NOPTS_VALUE, "invalid timestamp. " "Use --force-cfr and specify a framerate with --fps\n" ); pic->pts = info->PTS >> h->reduce_pts; pic->duration = 0; } return 0; } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { memset( pic, 0, sizeof(cli_pic_t) ); } static int close_file( hnd_t handle ) { ffms_hnd_t *h = handle; FFMS_DestroyVideoSource( h->video_source ); free( h ); return 0; } const cli_input_t ffms_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file }; x264-master/input/input.c000066400000000000000000000240701502133446700155240ustar00rootroot00000000000000/***************************************************************************** * input.c: common input functions ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Steven Walters * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #include "input.h" #ifdef _WIN32 #include #elif HAVE_MMAP #include #include #endif const x264_cli_csp_t x264_cli_csps[] = { [X264_CSP_I400] = { "i400", 1, { 1 }, { 1 }, 1, 1 }, [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 }, [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, [X264_CSP_I444] = { "i444", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 }, [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, [X264_CSP_YUYV] = { "yuyv", 1, { 2 }, { 1 }, 2, 1 }, [X264_CSP_UYVY] = { "uyvy", 1, { 2 }, { 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }, [X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 }, }; int x264_cli_csp_is_invalid( int csp ) { int csp_mask = csp & X264_CSP_MASK; return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER; } int x264_cli_csp_depth_factor( int csp ) { if( x264_cli_csp_is_invalid( csp ) ) return 0; return (csp & X264_CSP_HIGH_DEPTH) ? 2 : 1; } int64_t x264_cli_pic_plane_size( int csp, int width, int height, int plane ) { int csp_mask = csp & X264_CSP_MASK; if( x264_cli_csp_is_invalid( csp ) || plane < 0 || plane >= x264_cli_csps[csp_mask].planes ) return 0; int64_t size = (int64_t)width * height; size *= x264_cli_csps[csp_mask].width[plane] * x264_cli_csps[csp_mask].height[plane]; size *= x264_cli_csp_depth_factor( csp ); return size; } int64_t x264_cli_pic_size( int csp, int width, int height ) { if( x264_cli_csp_is_invalid( csp ) ) return 0; int64_t size = 0; int csp_mask = csp & X264_CSP_MASK; for( int i = 0; i < x264_cli_csps[csp_mask].planes; i++ ) size += x264_cli_pic_plane_size( csp, width, height, i ); return size; } static int cli_pic_init_internal( cli_pic_t *pic, int csp, int width, int height, int align, int alloc ) { memset( pic, 0, sizeof(cli_pic_t) ); int csp_mask = csp & X264_CSP_MASK; if( x264_cli_csp_is_invalid( csp ) ) pic->img.planes = 0; else pic->img.planes = x264_cli_csps[csp_mask].planes; pic->img.csp = csp; pic->img.width = width; pic->img.height = height; for( int i = 0; i < pic->img.planes; i++ ) { int stride = width * x264_cli_csps[csp_mask].width[i]; stride *= x264_cli_csp_depth_factor( csp ); stride = ALIGN( stride, align ); pic->img.stride[i] = stride; if( alloc ) { int64_t size = (int64_t)(height * x264_cli_csps[csp_mask].height[i]) * stride; pic->img.plane[i] = x264_malloc( size ); if( !pic->img.plane[i] ) return -1; } } return 0; } int x264_cli_pic_alloc( cli_pic_t *pic, int csp, int width, int height ) { return cli_pic_init_internal( pic, csp, width, height, 1, 1 ); } int x264_cli_pic_alloc_aligned( cli_pic_t *pic, int csp, int width, int height ) { return cli_pic_init_internal( pic, csp, width, height, NATIVE_ALIGN, 1 ); } int x264_cli_pic_init_noalloc( cli_pic_t *pic, int csp, int width, int height ) { return cli_pic_init_internal( pic, csp, width, height, 1, 0 ); } void x264_cli_pic_clean( cli_pic_t *pic ) { for( int i = 0; i < pic->img.planes; i++ ) x264_free( pic->img.plane[i] ); memset( pic, 0, sizeof(cli_pic_t) ); } const x264_cli_csp_t *x264_cli_get_csp( int csp ) { if( 
x264_cli_csp_is_invalid( csp ) ) return NULL; return x264_cli_csps + (csp&X264_CSP_MASK); } /* Functions for handling memory-mapped input frames */ int x264_cli_mmap_init( cli_mmap_t *h, FILE *fh ) { #if defined(_WIN32) || HAVE_MMAP int fd = fileno( fh ); x264_struct_stat file_stat; if( !x264_fstat( fd, &file_stat ) ) { h->file_size = file_stat.st_size; #ifdef _WIN32 HANDLE osfhandle = (HANDLE)_get_osfhandle( fd ); if( osfhandle != INVALID_HANDLE_VALUE ) { SYSTEM_INFO si; GetSystemInfo( &si ); h->page_mask = si.dwPageSize - 1; h->align_mask = si.dwAllocationGranularity - 1; h->prefetch_virtual_memory = (void*)GetProcAddress( GetModuleHandleW( L"kernel32.dll" ), "PrefetchVirtualMemory" ); h->process_handle = GetCurrentProcess(); h->map_handle = CreateFileMappingW( osfhandle, NULL, PAGE_READONLY, 0, 0, NULL ); return !h->map_handle; } #elif HAVE_MMAP && defined(_SC_PAGESIZE) h->align_mask = sysconf( _SC_PAGESIZE ) - 1; h->fd = fd; return h->align_mask < 0 || fd < 0; #endif } #endif return -1; } /* Third-party filters such as swscale can overread the input buffer which may result * in segfaults. We have to pad the buffer size as a workaround to avoid that. */ #define MMAP_PADDING 64 void *x264_cli_mmap( cli_mmap_t *h, int64_t offset, int64_t size ) { #if defined(_WIN32) || HAVE_MMAP uint8_t *base; int align = offset & h->align_mask; if( offset < 0 || size < 0 || (uint64_t)size > (SIZE_MAX - MMAP_PADDING - align) ) return NULL; offset -= align; size += align; #ifdef _WIN32 /* If the padding crosses a page boundary we need to increase the mapping size. */ size_t padded_size = (-size & h->page_mask) < MMAP_PADDING ? size + MMAP_PADDING : size; if( (uint64_t)offset + padded_size > (uint64_t)h->file_size ) { /* It's not possible to do the POSIX mmap() remapping trick on Windows, so if the padding crosses a * page boundary past the end of the file we have to copy the entire frame into a padded buffer. */ if( (base = MapViewOfFile( h->map_handle, FILE_MAP_READ, (uint64_t)offset >> 32, offset, size )) ) { uint8_t *buf = NULL; HANDLE anon_map = CreateFileMappingW( INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, (uint64_t)padded_size >> 32, padded_size, NULL ); if( anon_map ) { if( (buf = MapViewOfFile( anon_map, FILE_MAP_WRITE, 0, 0, 0 )) ) { buf += align; memcpy( buf, base + align, size - align ); } CloseHandle( anon_map ); } UnmapViewOfFile( base ); return buf; } } else if( (base = MapViewOfFile( h->map_handle, FILE_MAP_READ, (uint64_t)offset >> 32, offset, padded_size )) ) { /* PrefetchVirtualMemory() is only available on Windows 8 and newer. */ if( h->prefetch_virtual_memory ) { struct { void *addr; size_t size; } mem_range = { base, size }; h->prefetch_virtual_memory( h->process_handle, 1, &mem_range, 0 ); } return base + align; } #else size_t padded_size = size + MMAP_PADDING; if( (base = mmap( NULL, padded_size, PROT_READ, MAP_PRIVATE, h->fd, offset )) != MAP_FAILED ) { /* Ask the OS to readahead pages. This improves performance whereas * forcing page faults by manually accessing every page does not. * Some systems have implemented madvise() but not posix_madvise() * and vice versa, so check both to see if either is available. */ #ifdef MADV_WILLNEED madvise( base, size, MADV_WILLNEED ); #elif defined(POSIX_MADV_WILLNEED) posix_madvise( base, size, POSIX_MADV_WILLNEED ); #endif /* Remap the file mapping of any padding that crosses a page boundary past the end of * the file into a copy of the last valid page to prevent reads from invalid memory. 
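 * aligned_size below is the in-mapping offset of the final page; if that page would
 * begin at or beyond EOF it is overlaid (MAP_FIXED) with the page holding the last
 * valid bytes of the frame, so reads of the padding cannot fault.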
*/ size_t aligned_size = (padded_size - 1) & ~h->align_mask; if( offset + aligned_size >= h->file_size ) mmap( base + aligned_size, padded_size - aligned_size, PROT_READ, MAP_PRIVATE|MAP_FIXED, h->fd, (offset + size - 1) & ~h->align_mask ); return base + align; } #endif #endif return NULL; } int x264_cli_munmap( cli_mmap_t *h, void *addr, int64_t size ) { #if defined(_WIN32) || HAVE_MMAP void *base = (void*)((intptr_t)addr & ~h->align_mask); #ifdef _WIN32 return !UnmapViewOfFile( base ); #else if( size < 0 || size > (SIZE_MAX - MMAP_PADDING - ((intptr_t)addr - (intptr_t)base)) ) return -1; return munmap( base, size + MMAP_PADDING + (intptr_t)addr - (intptr_t)base ); #endif #endif return -1; } void x264_cli_mmap_close( cli_mmap_t *h ) { #ifdef _WIN32 CloseHandle( h->map_handle ); #endif } x264-master/input/input.h000066400000000000000000000121371502133446700155320ustar00rootroot00000000000000/***************************************************************************** * input.h: file input ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
*****************************************************************************/ #ifndef X264_INPUT_H #define X264_INPUT_H #include "x264cli.h" #ifdef _WIN32 #include #endif /* options that are used by only some demuxers */ typedef struct { char *index_file; char *format; char *resolution; char *colorspace; int bit_depth; char *timebase; int seek; int progress; int output_csp; /* convert to this csp, if applicable */ int output_range; /* user desired output range */ int input_range; /* user override input range */ } cli_input_opt_t; /* properties of the source given by the demuxer */ typedef struct { int csp; /* colorspace of the input */ uint32_t fps_num; uint32_t fps_den; int fullrange; /* has 2^bit_depth-1 instead of 219*2^(bit_depth-8) ranges (YUV only) */ int width; int height; int interlaced; int num_frames; uint32_t sar_width; uint32_t sar_height; int tff; int thread_safe; /* demuxer is thread_input safe */ uint32_t timebase_num; uint32_t timebase_den; int vfr; } video_info_t; /* image data type used by x264cli */ typedef struct { int csp; /* colorspace */ int width; /* width of the picture */ int height; /* height of the picture */ int planes; /* number of planes */ uint8_t *plane[4]; /* pointers for each plane */ int stride[4]; /* strides for each plane */ } cli_image_t; typedef struct { cli_image_t img; int64_t pts; /* input pts */ int64_t duration; /* frame duration - used for vfr */ void *opaque; /* opaque handle */ } cli_pic_t; typedef struct { int (*open_file)( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ); int (*picture_alloc)( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ); int (*read_frame)( cli_pic_t *pic, hnd_t handle, int i_frame ); int (*release_frame)( cli_pic_t *pic, hnd_t handle ); void (*picture_clean)( cli_pic_t *pic, hnd_t handle ); int (*close_file)( hnd_t handle ); } cli_input_t; extern const cli_input_t raw_input; extern const cli_input_t y4m_input; extern const cli_input_t avs_input; extern const cli_input_t thread_8_input; extern const cli_input_t thread_10_input; extern const cli_input_t lavf_input; extern const cli_input_t ffms_input; extern const cli_input_t timecode_input; extern cli_input_t cli_input; /* extended colorspace list that isn't supported by libx264 but by the cli */ #define X264_CSP_CLI_MAX X264_CSP_MAX /* end of list */ #define X264_CSP_OTHER 0x4000 /* non x264 colorspace */ typedef struct { const char *name; int planes; float width[4]; float height[4]; int mod_width; int mod_height; } x264_cli_csp_t; extern const x264_cli_csp_t x264_cli_csps[]; int x264_cli_csp_is_invalid( int csp ); int x264_cli_csp_depth_factor( int csp ); int x264_cli_pic_alloc( cli_pic_t *pic, int csp, int width, int height ); int x264_cli_pic_alloc_aligned( cli_pic_t *pic, int csp, int width, int height ); int x264_cli_pic_init_noalloc( cli_pic_t *pic, int csp, int width, int height ); void x264_cli_pic_clean( cli_pic_t *pic ); int64_t x264_cli_pic_plane_size( int csp, int width, int height, int plane ); int64_t x264_cli_pic_size( int csp, int width, int height ); const x264_cli_csp_t *x264_cli_get_csp( int csp ); typedef struct { int64_t file_size; int align_mask; #ifdef _WIN32 int page_mask; BOOL (WINAPI *prefetch_virtual_memory)( HANDLE, ULONG_PTR, PVOID, ULONG ); HANDLE process_handle; HANDLE map_handle; #elif HAVE_MMAP int fd; #endif } cli_mmap_t; int x264_cli_mmap_init( cli_mmap_t *h, FILE *fh ); void *x264_cli_mmap( cli_mmap_t *h, int64_t offset, int64_t size ); int x264_cli_munmap( cli_mmap_t *h, void *addr, 
int64_t size ); void x264_cli_mmap_close( cli_mmap_t *h ); #endif x264-master/input/lavf.c000066400000000000000000000223541502133446700153200ustar00rootroot00000000000000/***************************************************************************** * lavf.c: libavformat input ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Mike Gurlitz * Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "input.h" #undef DECLARE_ALIGNED #include #include #include #include #include #include #include #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "lavf", __VA_ARGS__ ) typedef struct { AVFormatContext *lavf; AVCodecContext *lavc; AVFrame *frame; AVPacket *pkt; int stream_id; int next_frame; int vfr_input; cli_pic_t *first_pic; } lavf_hnd_t; /* handle the deprecated jpeg pixel formats */ static int handle_jpeg( int csp, int *fullrange ) { switch( csp ) { case AV_PIX_FMT_YUVJ420P: *fullrange = 1; return AV_PIX_FMT_YUV420P; case AV_PIX_FMT_YUVJ422P: *fullrange = 1; return AV_PIX_FMT_YUV422P; case AV_PIX_FMT_YUVJ444P: *fullrange = 1; return AV_PIX_FMT_YUV444P; default: return csp; } } static AVCodecContext *codec_from_stream( AVStream *stream ) { AVCodec *codec = avcodec_find_decoder( stream->codecpar->codec_id ); if( !codec ) return NULL; AVCodecContext *c = avcodec_alloc_context3( codec ); if( !c ) return NULL; if( avcodec_parameters_to_context( c, stream->codecpar ) < 0 ) { avcodec_free_context( &c ); return NULL; } return c; } static int read_frame_internal( cli_pic_t *p_pic, lavf_hnd_t *h, int i_frame, video_info_t *info ) { if( h->first_pic && !info ) { /* see if the frame we are requesting is the frame we have already read and stored. * if so, retrieve the pts and image data before freeing it. 
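 * (open_file() decodes frame 0 ahead of time to probe the stream properties
 * (width, height, colorspace, interlacing) and parks it in first_pic, so the first
 * real request can reuse that decoded frame instead of decoding it twice.)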
*/ if( !i_frame ) { XCHG( cli_image_t, p_pic->img, h->first_pic->img ); p_pic->pts = h->first_pic->pts; } lavf_input.picture_clean( h->first_pic, h ); free( h->first_pic ); h->first_pic = NULL; if( !i_frame ) return 0; } AVPacket *pkt = h->pkt; while( i_frame >= h->next_frame ) { int ret; while( (ret = avcodec_receive_frame( h->lavc, h->frame )) ) { if( ret == AVERROR(EAGAIN) ) { while( !(ret = av_read_frame( h->lavf, pkt )) && pkt->stream_index != h->stream_id ) av_packet_unref( pkt ); if( ret ) ret = avcodec_send_packet( h->lavc, NULL ); else { ret = avcodec_send_packet( h->lavc, pkt ); av_packet_unref( pkt ); } } else if( ret == AVERROR_EOF ) return -1; if( ret ) { x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame ); return -1; } } h->next_frame++; } memcpy( p_pic->img.stride, h->frame->linesize, sizeof(p_pic->img.stride) ); memcpy( p_pic->img.plane, h->frame->data, sizeof(p_pic->img.plane) ); int is_fullrange = 0; p_pic->img.width = h->lavc->width; p_pic->img.height = h->lavc->height; p_pic->img.csp = handle_jpeg( h->lavc->pix_fmt, &is_fullrange ) | X264_CSP_OTHER; if( info ) { info->fullrange = is_fullrange; #if LIBAVUTIL_VERSION_MAJOR < 60 info->interlaced = h->frame->interlaced_frame; info->tff = h->frame->top_field_first; #else info->interlaced = !!(h->frame->flags & AV_FRAME_FLAG_INTERLACED); info->tff = !!(h->frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST); #endif } if( h->vfr_input ) { p_pic->pts = p_pic->duration = 0; if( h->frame->pts != AV_NOPTS_VALUE ) p_pic->pts = h->frame->pts; else if( h->frame->pkt_dts != AV_NOPTS_VALUE ) p_pic->pts = h->frame->pkt_dts; // for AVI files else if( info ) { h->vfr_input = info->vfr = 0; return 0; } } return 0; } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { lavf_hnd_t *h = calloc( 1, sizeof(lavf_hnd_t) ); if( !h ) return -1; if( !strcmp( psz_filename, "-" ) ) psz_filename = "pipe:"; h->frame = av_frame_alloc(); if( !h->frame ) return -1; h->pkt = av_packet_alloc(); if( !h->pkt ) return -1; /* if resolution was passed in, place it and colorspace into options. this allows raw video support */ AVDictionary *options = NULL; if( opt->resolution ) { av_dict_set( &options, "video_size", opt->resolution, 0 ); const char *csp = opt->colorspace ? opt->colorspace : av_get_pix_fmt_name( AV_PIX_FMT_YUV420P ); av_dict_set( &options, "pixel_format", csp, 0 ); } /* specify the input format. 
this is helpful when lavf fails to guess */ AVInputFormat *format = NULL; if( opt->format ) FAIL_IF_ERROR( !(format = av_find_input_format( opt->format )), "unknown file format: %s\n", opt->format ); FAIL_IF_ERROR( avformat_open_input( &h->lavf, psz_filename, format, &options ), "could not open input file\n" ); if( options ) av_dict_free( &options ); FAIL_IF_ERROR( avformat_find_stream_info( h->lavf, NULL ) < 0, "could not find input stream info\n" ); int i = 0; while( i < h->lavf->nb_streams && h->lavf->streams[i]->codecpar->codec_type != AVMEDIA_TYPE_VIDEO ) i++; FAIL_IF_ERROR( i == h->lavf->nb_streams, "could not find video stream\n" ); h->stream_id = i; h->next_frame = 0; h->lavc = codec_from_stream( h->lavf->streams[i] ); if( !h->lavc ) return -1; info->fps_num = h->lavf->streams[i]->avg_frame_rate.num; info->fps_den = h->lavf->streams[i]->avg_frame_rate.den; info->timebase_num = h->lavf->streams[i]->time_base.num; info->timebase_den = h->lavf->streams[i]->time_base.den; /* lavf is thread unsafe as calling av_read_frame invalidates previously read AVPackets */ info->thread_safe = 0; h->vfr_input = info->vfr; FAIL_IF_ERROR( avcodec_open2( h->lavc, avcodec_find_decoder( h->lavc->codec_id ), NULL ), "could not find decoder for video stream\n" ); /* prefetch the first frame and set/confirm flags */ h->first_pic = malloc( sizeof(cli_pic_t) ); FAIL_IF_ERROR( !h->first_pic || lavf_input.picture_alloc( h->first_pic, h, X264_CSP_OTHER, info->width, info->height ), "malloc failed\n" ); if( read_frame_internal( h->first_pic, h, 0, info ) ) return -1; info->width = h->lavc->width; info->height = h->lavc->height; info->csp = h->first_pic->img.csp; info->num_frames = h->lavf->streams[i]->nb_frames; info->sar_height = h->lavc->sample_aspect_ratio.den; info->sar_width = h->lavc->sample_aspect_ratio.num; info->fullrange |= h->lavc->color_range == AVCOL_RANGE_JPEG; /* avisynth stores rgb data vertically flipped. */ if( !strcasecmp( get_filename_extension( psz_filename ), "avs" ) && (h->lavc->pix_fmt == AV_PIX_FMT_BGRA || h->lavc->pix_fmt == AV_PIX_FMT_BGR24) ) info->csp |= X264_CSP_VFLIP; *p_handle = h; return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { if( x264_cli_pic_alloc( pic, X264_CSP_NONE, width, height ) ) return -1; pic->img.csp = csp; pic->img.planes = 4; return 0; } static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame ) { return read_frame_internal( pic, handle, i_frame, NULL ); } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { memset( pic, 0, sizeof(cli_pic_t) ); } static int close_file( hnd_t handle ) { lavf_hnd_t *h = handle; avcodec_free_context( &h->lavc ); avformat_close_input( &h->lavf ); av_packet_free( &h->pkt ); av_frame_free( &h->frame ); free( h ); return 0; } const cli_input_t lavf_input = { open_file, picture_alloc, read_frame, NULL, picture_clean, close_file }; x264-master/input/raw.c000066400000000000000000000150451502133446700151600ustar00rootroot00000000000000/***************************************************************************** * raw.c: raw input ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Steven Walters * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "input.h" #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "raw", __VA_ARGS__ ) typedef struct { FILE *fh; int next_frame; int64_t plane_size[4]; int64_t frame_size; int bit_depth; cli_mmap_t mmap; int use_mmap; } raw_hnd_t; static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { raw_hnd_t *h = calloc( 1, sizeof(raw_hnd_t) ); if( !h ) return -1; if( !opt->resolution ) { /* try to parse the file name */ for( char *p = psz_filename; *p; p++ ) if( *p >= '0' && *p <= '9' && sscanf( p, "%dx%d", &info->width, &info->height ) == 2 ) break; } else sscanf( opt->resolution, "%dx%d", &info->width, &info->height ); FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" ); if( opt->colorspace ) { for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- ) { if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) ) break; } FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace ); } else /* default */ info->csp = X264_CSP_I420; h->bit_depth = opt->bit_depth; FAIL_IF_ERROR( h->bit_depth < 8 || h->bit_depth > 16, "unsupported bit depth `%d'\n", h->bit_depth ); if( h->bit_depth > 8 ) info->csp |= X264_CSP_HIGH_DEPTH; if( !strcmp( psz_filename, "-" ) ) h->fh = stdin; else h->fh = x264_fopen( psz_filename, "rb" ); if( h->fh == NULL ) return -1; info->thread_safe = 1; info->num_frames = 0; info->vfr = 0; const x264_cli_csp_t *csp = x264_cli_get_csp( info->csp ); for( int i = 0; i < csp->planes; i++ ) { h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i ); h->frame_size += h->plane_size[i]; /* x264_cli_pic_plane_size returns the size in bytes, we need the value in pixels from here on */ h->plane_size[i] /= x264_cli_csp_depth_factor( info->csp ); } if( x264_is_regular_file( h->fh ) ) { fseek( h->fh, 0, SEEK_END ); int64_t size = ftell( h->fh ); fseek( h->fh, 0, SEEK_SET ); info->num_frames = size / h->frame_size; FAIL_IF_ERROR( !info->num_frames, "empty input file\n" ); /* Attempt to use memory-mapped input frames if possible */ if( !(h->bit_depth & 7) ) h->use_mmap = !x264_cli_mmap_init( &h->mmap, h->fh ); } *p_handle = h; return 0; } static int read_frame_internal( cli_pic_t *pic, raw_hnd_t *h, int bit_depth_uc ) { int pixel_depth = x264_cli_csp_depth_factor( pic->img.csp ); for( int i = 0; i < pic->img.planes; i++ ) { if( h->use_mmap ) { if( i ) pic->img.plane[i] = pic->img.plane[i-1] + pixel_depth * h->plane_size[i-1]; } else if( fread( pic->img.plane[i], pixel_depth, h->plane_size[i], h->fh ) != (uint64_t)h->plane_size[i] ) return -1; if( bit_depth_uc ) { /* upconvert non 16bit high depth planes to 16bit using the same * algorithm as used in the depth filter. 
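 * For example, with a hypothetical 10-bit input each sample is shifted left by
 * 16 - 10 = 6 bits, so the largest code value 0x3FF becomes 0xFFC0:
 *
 *     uint16_t v10 = 0x3FF;
 *     uint16_t v16 = v10 << 6;    // 0xFFC0, value aligned to the high bits
 *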
*/ uint16_t *plane = (uint16_t*)pic->img.plane[i]; int64_t pixel_count = h->plane_size[i]; int lshift = 16 - h->bit_depth; for( int64_t j = 0; j < pixel_count; j++ ) plane[j] = plane[j] << lshift; } } return 0; } static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame ) { raw_hnd_t *h = handle; if( h->use_mmap ) { pic->img.plane[0] = x264_cli_mmap( &h->mmap, i_frame * h->frame_size, h->frame_size ); if( !pic->img.plane[0] ) return -1; } else if( i_frame > h->next_frame ) { if( x264_is_regular_file( h->fh ) ) fseek( h->fh, i_frame * h->frame_size, SEEK_SET ); else while( i_frame > h->next_frame ) { if( read_frame_internal( pic, h, 0 ) ) return -1; h->next_frame++; } } if( read_frame_internal( pic, h, h->bit_depth & 7 ) ) return -1; h->next_frame = i_frame+1; return 0; } static int release_frame( cli_pic_t *pic, hnd_t handle ) { raw_hnd_t *h = handle; if( h->use_mmap ) return x264_cli_munmap( &h->mmap, pic->img.plane[0], h->frame_size ); return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { raw_hnd_t *h = handle; return (h->use_mmap ? x264_cli_pic_init_noalloc : x264_cli_pic_alloc)( pic, csp, width, height ); } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { raw_hnd_t *h = handle; if( h->use_mmap ) memset( pic, 0, sizeof(cli_pic_t) ); else x264_cli_pic_clean( pic ); } static int close_file( hnd_t handle ) { raw_hnd_t *h = handle; if( !h || !h->fh ) return 0; if( h->use_mmap ) x264_cli_mmap_close( &h->mmap ); fclose( h->fh ); free( h ); return 0; } const cli_input_t raw_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file }; x264-master/input/thread.c000066400000000000000000000102651502133446700156350ustar00rootroot00000000000000/***************************************************************************** * thread.c: threaded input ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
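 *
 * (Illustrative aside on the threaded reader below: it hides input latency by
 *  prefetching frame i+1 on a one-thread pool while the encoder consumes frame
 *  i, then handing the prefetched picture over with a pointer swap instead of
 *  a copy.  A rough sketch of that steady-state hand-off, using hypothetical
 *  picture_t, read_one_frame and swap names and plain pthreads in place of
 *  x264's thread pool:
 *
 *      typedef struct { pthread_t tid; picture_t ready; } prefetch_t;
 *
 *      static void *worker( void *arg )        // reads one frame ahead
 *      {
 *          prefetch_t *p = arg;
 *          read_one_frame( &p->ready );        // assumed blocking read
 *          return NULL;
 *      }
 *
 *      // per request: wait for the prefetch, swap it in, start the next one
 *      pthread_join( p->tid, NULL );
 *      swap( picture_t, *out, p->ready );      // like XCHG in read_frame below
 *      pthread_create( &p->tid, NULL, worker, p );
 *
 *  Start-up, out-of-order requests and end of stream are omitted here; the
 *  real implementation below handles all three.)
 *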
*****************************************************************************/ #include "input.h" #include "common/common.h" #define thread_input x264_glue3(thread, BIT_DEPTH, input) typedef struct { cli_input_t input; hnd_t p_handle; cli_pic_t pic; x264_threadpool_t *pool; int next_frame; int frame_total; struct thread_input_arg_t *next_args; } thread_hnd_t; typedef struct thread_input_arg_t { thread_hnd_t *h; cli_pic_t *pic; int i_frame; int status; } thread_input_arg_t; static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { thread_hnd_t *h = malloc( sizeof(thread_hnd_t) ); FAIL_IF_ERR( !h || cli_input.picture_alloc( &h->pic, *p_handle, info->csp, info->width, info->height ), "x264", "malloc failed\n" ); h->input = cli_input; h->p_handle = *p_handle; h->next_frame = -1; h->next_args = malloc( sizeof(thread_input_arg_t) ); if( !h->next_args ) return -1; h->next_args->h = h; h->next_args->status = 0; h->frame_total = info->num_frames; if( x264_threadpool_init( &h->pool, 1 ) ) return -1; *p_handle = h; return 0; } static void read_frame_thread_int( thread_input_arg_t *i ) { i->status = i->h->input.read_frame( i->pic, i->h->p_handle, i->i_frame ); } static int read_frame( cli_pic_t *p_pic, hnd_t handle, int i_frame ) { thread_hnd_t *h = handle; int ret = 0; if( h->next_frame >= 0 ) { x264_threadpool_wait( h->pool, h->next_args ); ret |= h->next_args->status; } if( h->next_frame == i_frame ) XCHG( cli_pic_t, *p_pic, h->pic ); else { if( h->next_frame >= 0 ) thread_input.release_frame( &h->pic, handle ); ret |= h->input.read_frame( p_pic, h->p_handle, i_frame ); } if( !h->frame_total || i_frame+1 < h->frame_total ) { h->next_frame = h->next_args->i_frame = i_frame+1; h->next_args->pic = &h->pic; x264_threadpool_run( h->pool, (void*)read_frame_thread_int, h->next_args ); } else h->next_frame = -1; return ret; } static int release_frame( cli_pic_t *pic, hnd_t handle ) { thread_hnd_t *h = handle; if( h->input.release_frame ) return h->input.release_frame( pic, h->p_handle ); return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { thread_hnd_t *h = handle; return h->input.picture_alloc( pic, h->p_handle, csp, width, height ); } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { thread_hnd_t *h = handle; h->input.picture_clean( pic, h->p_handle ); } static int close_file( hnd_t handle ) { thread_hnd_t *h = handle; x264_threadpool_delete( h->pool ); h->input.picture_clean( &h->pic, h->p_handle ); h->input.close_file( h->p_handle ); free( h->next_args ); free( h ); return 0; } const cli_input_t thread_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file }; x264-master/input/timecode.c000066400000000000000000000414451502133446700161630ustar00rootroot00000000000000/***************************************************************************** * timecode.c: timecode file input ***************************************************************************** * Copyright (C) 2010-2025 x264 project * * Authors: Yusuke Nakamura * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
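 *
 * (Illustrative aside on the parser below: a "timecode format v2" file is just
 *  one presentation time per line, in milliseconds, e.g.
 *
 *      # timecode format v2
 *      0
 *      41.708
 *      83.417
 *      125.125
 *
 *  Each value is later scaled into the output timebase when the pts array is
 *  built; with a hypothetical timebase of 1/24000 the second entry becomes
 *
 *      pts = (int64_t)(0.041708 * 24000 + 0.5) = 1001 ticks
 *
 *  i.e. the familiar 1001/24000 frame duration.)
 *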
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "input.h" #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "timecode", __VA_ARGS__ ) typedef struct { cli_input_t input; hnd_t p_handle; int auto_timebase_num; int auto_timebase_den; uint64_t timebase_num; uint64_t timebase_den; int stored_pts_num; int64_t *pts; double assume_fps; double last_timecode; } timecode_hnd_t; static inline double sigexp10( double value, double *exponent ) { /* This function separates significand and exp10 from double floating point. */ *exponent = pow( 10, floor( log10( value ) ) ); return value / *exponent; } #define DOUBLE_EPSILON 5e-6 #define MKV_TIMEBASE_DEN 1000000000 static double correct_fps( double fps, timecode_hnd_t *h ) { int i = 1; uint64_t fps_num, fps_den; double exponent; double fps_sig = sigexp10( fps, &exponent ); while( 1 ) { fps_den = i * h->timebase_num; fps_num = round( fps_den * fps_sig ) * exponent; FAIL_IF_ERROR( fps_num > UINT32_MAX, "tcfile fps correction failed.\n" " Specify an appropriate timebase manually or remake tcfile.\n" ); if( fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON ) break; ++i; } if( h->auto_timebase_den ) { h->timebase_den = h->timebase_den ? lcm( h->timebase_den, fps_num ) : fps_num; if( h->timebase_den > UINT32_MAX ) h->auto_timebase_den = 0; } return (double)fps_num / fps_den; } static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num ) { h->timebase_num = 0; h->timebase_den = MKV_TIMEBASE_DEN; for( int num = 0; num < loop_num; num++ ) { uint64_t fps_den; double exponent; double fps_sig = sigexp10( fpss[num], &exponent ); fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent; h->timebase_num = fps_den && h->timebase_num ? 
gcd( h->timebase_num, fps_den ) : fps_den; FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || !h->timebase_num, "automatic timebase generation failed.\n" " Specify timebase manually.\n" ); } return 0; } static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info ) { char buff[256]; int ret, tcfv, num, seq_num, timecodes_num; double *timecodes = NULL; double *fpss = NULL; ret = fgets( buff, sizeof(buff), tcfile_in ) != NULL && (sscanf( buff, "# timecode format v%d", &tcfv ) == 1 || sscanf( buff, "# timestamp format v%d", &tcfv ) == 1); FAIL_IF_ERROR( !ret || (tcfv != 1 && tcfv != 2), "unsupported timecode format\n" ); #define NO_TIMECODE_LINE (buff[0] == '#' || buff[0] == '\n' || buff[0] == '\r') if( tcfv == 1 ) { int64_t file_pos; double assume_fps, seq_fps; int start, end = -1; int prev_start = -1, prev_end = -1; h->assume_fps = 0; for( num = 2; fgets( buff, sizeof(buff), tcfile_in ) != NULL; num++ ) { if( NO_TIMECODE_LINE ) continue; FAIL_IF_ERROR( sscanf( buff, "assume %lf", &h->assume_fps ) != 1 && sscanf( buff, "Assume %lf", &h->assume_fps ) != 1, "tcfile parsing error: assumed fps not found\n" ); break; } FAIL_IF_ERROR( h->assume_fps <= 0, "invalid assumed fps %.6f\n", h->assume_fps ); file_pos = ftell( tcfile_in ); h->stored_pts_num = 0; for( seq_num = 0; fgets( buff, sizeof(buff), tcfile_in ) != NULL; num++ ) { if( NO_TIMECODE_LINE ) { if( sscanf( buff, "# TDecimate Mode 3: Last Frame = %d", &end ) == 1 ) h->stored_pts_num = end + 1; continue; } ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps ); FAIL_IF_ERROR( ret != 3 && ret != EOF, "invalid input tcfile\n" ); FAIL_IF_ERROR( start > end || start <= prev_start || end <= prev_end || seq_fps <= 0, "invalid input tcfile at line %d: %s\n", num, buff ); prev_start = start; prev_end = end; if( h->auto_timebase_den || h->auto_timebase_num ) ++seq_num; } if( !h->stored_pts_num ) h->stored_pts_num = end + 2; timecodes_num = h->stored_pts_num; fseek( tcfile_in, file_pos, SEEK_SET ); timecodes = malloc( timecodes_num * sizeof(double) ); if( !timecodes ) return -1; if( h->auto_timebase_den || h->auto_timebase_num ) { fpss = malloc( (seq_num + 1) * sizeof(double) ); if( !fpss ) goto fail; } assume_fps = correct_fps( h->assume_fps, h ); if( assume_fps < 0 ) goto fail; timecodes[0] = 0; for( num = seq_num = 0; num < timecodes_num - 1 && fgets( buff, sizeof(buff), tcfile_in ) != NULL; ) { if( NO_TIMECODE_LINE ) continue; ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps ); if( ret != 3 ) start = end = timecodes_num - 1; for( ; num < start && num < timecodes_num - 1; num++ ) timecodes[num + 1] = timecodes[num] + 1 / assume_fps; if( num < timecodes_num - 1 ) { if( h->auto_timebase_den || h->auto_timebase_num ) fpss[seq_num++] = seq_fps; seq_fps = correct_fps( seq_fps, h ); if( seq_fps < 0 ) goto fail; for( num = start; num <= end && num < timecodes_num - 1; num++ ) timecodes[num + 1] = timecodes[num] + 1 / seq_fps; } } for( ; num < timecodes_num - 1; num++ ) timecodes[num + 1] = timecodes[num] + 1 / assume_fps; if( h->auto_timebase_den || h->auto_timebase_num ) fpss[seq_num] = h->assume_fps; if( h->auto_timebase_num && !h->auto_timebase_den ) { double exponent; double assume_fps_sig, seq_fps_sig; if( try_mkv_timebase_den( fpss, h, seq_num + 1 ) < 0 ) goto fail; fseek( tcfile_in, file_pos, SEEK_SET ); assume_fps_sig = sigexp10( h->assume_fps, &exponent ); assume_fps = MKV_TIMEBASE_DEN / ( round( MKV_TIMEBASE_DEN / assume_fps_sig ) / exponent ); for( num = 0; num < timecodes_num - 1 && fgets( buff, sizeof(buff), tcfile_in 
) != NULL; ) { if( NO_TIMECODE_LINE ) continue; ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps ); if( ret != 3 ) start = end = timecodes_num - 1; seq_fps_sig = sigexp10( seq_fps, &exponent ); seq_fps = MKV_TIMEBASE_DEN / ( round( MKV_TIMEBASE_DEN / seq_fps_sig ) / exponent ); for( ; num < start && num < timecodes_num - 1; num++ ) timecodes[num + 1] = timecodes[num] + 1 / assume_fps; for( num = start; num <= end && num < timecodes_num - 1; num++ ) timecodes[num + 1] = timecodes[num] + 1 / seq_fps; } for( ; num < timecodes_num - 1; num++ ) timecodes[num + 1] = timecodes[num] + 1 / assume_fps; } if( fpss ) { free( fpss ); fpss = NULL; } h->assume_fps = assume_fps; h->last_timecode = timecodes[timecodes_num - 1]; } else /* tcfv == 2 */ { int64_t file_pos = ftell( tcfile_in ); h->stored_pts_num = 0; while( fgets( buff, sizeof(buff), tcfile_in ) != NULL ) { if( NO_TIMECODE_LINE ) { if( !h->stored_pts_num ) file_pos = ftell( tcfile_in ); continue; } h->stored_pts_num++; } timecodes_num = h->stored_pts_num; FAIL_IF_ERROR( !timecodes_num, "input tcfile doesn't have any timecodes!\n" ); fseek( tcfile_in, file_pos, SEEK_SET ); timecodes = malloc( timecodes_num * sizeof(double) ); if( !timecodes ) return -1; num = 0; if( fgets( buff, sizeof(buff), tcfile_in ) != NULL ) { ret = sscanf( buff, "%lf", &timecodes[0] ); timecodes[0] *= 1e-3; /* Timecode format v2 is expressed in milliseconds. */ FAIL_IF_ERROR( ret != 1, "invalid input tcfile for frame 0\n" ); for( num = 1; num < timecodes_num && fgets( buff, sizeof(buff), tcfile_in ) != NULL; ) { if( NO_TIMECODE_LINE ) continue; ret = sscanf( buff, "%lf", &timecodes[num] ); timecodes[num] *= 1e-3; /* Timecode format v2 is expressed in milliseconds. */ FAIL_IF_ERROR( ret != 1 || timecodes[num] <= timecodes[num - 1], "invalid input tcfile for frame %d\n", num ); ++num; } } FAIL_IF_ERROR( num < timecodes_num, "failed to read input tcfile for frame %d", num ); if( timecodes_num == 1 ) h->timebase_den = info->fps_num; else if( h->auto_timebase_den ) { fpss = malloc( (timecodes_num - 1) * sizeof(double) ); if( !fpss ) goto fail; for( num = 0; num < timecodes_num - 1; num++ ) { fpss[num] = 1 / (timecodes[num + 1] - timecodes[num]); if( h->auto_timebase_den ) { int i = 1; uint64_t fps_num, fps_den; double exponent; double fps_sig = sigexp10( fpss[num], &exponent ); while( 1 ) { fps_den = i * h->timebase_num; fps_num = round( fps_den * fps_sig ) * exponent; if( fps_num > UINT32_MAX || fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON ) break; ++i; } h->timebase_den = fps_num && h->timebase_den ? 
lcm( h->timebase_den, fps_num ) : fps_num; if( h->timebase_den > UINT32_MAX ) { h->auto_timebase_den = 0; continue; } } } if( h->auto_timebase_num && !h->auto_timebase_den ) if( try_mkv_timebase_den( fpss, h, timecodes_num - 1 ) < 0 ) goto fail; free( fpss ); fpss = NULL; } if( timecodes_num > 1 ) h->assume_fps = 1 / (timecodes[timecodes_num - 1] - timecodes[timecodes_num - 2]); else h->assume_fps = (double)info->fps_num / info->fps_den; h->last_timecode = timecodes[timecodes_num - 1]; } #undef NO_TIMECODE_LINE if( h->auto_timebase_den || h->auto_timebase_num ) { uint64_t i = gcd( h->timebase_num, h->timebase_den ); h->timebase_num /= i; h->timebase_den /= i; x264_cli_log( "timecode", X264_LOG_INFO, "automatic timebase generation %"PRIu64"/%"PRIu64"\n", h->timebase_num, h->timebase_den ); } else FAIL_IF_ERROR( h->timebase_den > UINT32_MAX || !h->timebase_den, "automatic timebase generation failed.\n" " Specify an appropriate timebase manually.\n" ); h->pts = malloc( h->stored_pts_num * sizeof(int64_t) ); if( !h->pts ) goto fail; for( num = 0; num < h->stored_pts_num; num++ ) { h->pts[num] = timecodes[num] * ((double)h->timebase_den / h->timebase_num) + 0.5; FAIL_IF_ERROR( num > 0 && h->pts[num] <= h->pts[num - 1], "invalid timebase or timecode for frame %d\n", num ); } free( timecodes ); return 0; fail: if( timecodes ) free( timecodes ); if( fpss ) free( fpss ); return -1; } #undef DOUBLE_EPSILON #undef MKV_TIMEBASE_DEN static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { int ret = 0; FILE *tcfile_in; timecode_hnd_t *h = malloc( sizeof(timecode_hnd_t) ); FAIL_IF_ERROR( !h, "malloc failed\n" ); h->input = cli_input; h->p_handle = *p_handle; h->pts = NULL; if( opt->timebase ) { ret = sscanf( opt->timebase, "%"SCNu64"/%"SCNu64, &h->timebase_num, &h->timebase_den ); if( ret == 1 ) { h->timebase_num = strtoul( opt->timebase, NULL, 10 ); h->timebase_den = 0; /* set later by auto timebase generation */ } FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || h->timebase_den > UINT32_MAX, "timebase you specified exceeds H.264 maximum\n" ); } h->auto_timebase_num = !ret; h->auto_timebase_den = ret < 2; if( h->auto_timebase_num ) h->timebase_num = info->fps_den; /* can be changed later by auto timebase generation */ if( h->auto_timebase_den ) h->timebase_den = 0; /* set later by auto timebase generation */ tcfile_in = x264_fopen( psz_filename, "rb" ); FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename ); if( !x264_is_regular_file( tcfile_in ) ) { x264_cli_log( "timecode", X264_LOG_ERROR, "tcfile input incompatible with non-regular file `%s'\n", psz_filename ); fclose( tcfile_in ); return -1; } if( parse_tcfile( tcfile_in, h, info ) < 0 ) { if( h->pts ) free( h->pts ); fclose( tcfile_in ); return -1; } fclose( tcfile_in ); info->timebase_num = h->timebase_num; info->timebase_den = h->timebase_den; info->vfr = 1; *p_handle = h; return 0; } static int64_t get_frame_pts( timecode_hnd_t *h, int frame, int real_frame ) { if( frame < h->stored_pts_num ) return h->pts[frame]; else { if( h->pts && real_frame ) { x264_cli_log( "timecode", X264_LOG_INFO, "input timecode file missing data for frame %d and later\n" " assuming constant fps %.6f\n", frame, h->assume_fps ); free( h->pts ); h->pts = NULL; } double timecode = h->last_timecode + 1 / h->assume_fps; if( real_frame ) h->last_timecode = timecode; return timecode * ((double)h->timebase_den / h->timebase_num) + 0.5; } } static int read_frame( cli_pic_t *pic, hnd_t handle, int frame ) { timecode_hnd_t *h = 
handle; if( h->input.read_frame( pic, h->p_handle, frame ) ) return -1; pic->pts = get_frame_pts( h, frame, 1 ); pic->duration = get_frame_pts( h, frame + 1, 0 ) - pic->pts; return 0; } static int release_frame( cli_pic_t *pic, hnd_t handle ) { timecode_hnd_t *h = handle; if( h->input.release_frame ) return h->input.release_frame( pic, h->p_handle ); return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { timecode_hnd_t *h = handle; return h->input.picture_alloc( pic, h->p_handle, csp, width, height ); } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { timecode_hnd_t *h = handle; h->input.picture_clean( pic, h->p_handle ); } static int close_file( hnd_t handle ) { timecode_hnd_t *h = handle; if( h->pts ) free( h->pts ); h->input.close_file( h->p_handle ); free( h ); return 0; } const cli_input_t timecode_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file }; x264-master/input/y4m.c000066400000000000000000000304171502133446700151000ustar00rootroot00000000000000/***************************************************************************** * y4m.c: y4m input ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "input.h" #define FAIL_IF_ERROR( cond, ... 
) FAIL_IF_ERR( cond, "y4m", __VA_ARGS__ ) typedef struct { FILE *fh; int next_frame; int seq_header_len; int frame_header_len; int64_t frame_size; int64_t plane_size[3]; int bit_depth; cli_mmap_t mmap; int use_mmap; } y4m_hnd_t; #define Y4M_MAGIC "YUV4MPEG2" #define Y4M_FRAME_MAGIC "FRAME" #define Y4M_MAX_HEADER 256 static int parse_csp_and_depth( char *csp_name, int *bit_depth ) { int csp = X264_CSP_MAX; /* Set colorspace from known variants */ if( !strncmp( "mono", csp_name, 4 ) ) csp = X264_CSP_I400; else if( !strncmp( "420", csp_name, 3 ) ) csp = X264_CSP_I420; else if( !strncmp( "422", csp_name, 3 ) ) csp = X264_CSP_I422; else if( !strncmp( "444", csp_name, 3 ) && strncmp( "444alpha", csp_name, 8 ) ) // only accept alphaless 4:4:4 csp = X264_CSP_I444; /* Set high bit depth from known extensions */ if( sscanf( csp_name, "mono%d", bit_depth ) != 1 && sscanf( csp_name, "%*d%*[pP]%d", bit_depth ) != 1 ) *bit_depth = 8; return csp; } static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt ) { y4m_hnd_t *h = calloc( 1, sizeof(y4m_hnd_t) ); int i; uint32_t n, d; char header[Y4M_MAX_HEADER+10]; char *tokend, *header_end; int colorspace = X264_CSP_NONE; int alt_colorspace = X264_CSP_NONE; int alt_bit_depth = 8; if( !h ) return -1; info->vfr = 0; if( !strcmp( psz_filename, "-" ) ) h->fh = stdin; else h->fh = x264_fopen(psz_filename, "rb"); if( h->fh == NULL ) return -1; /* Read header */ for( i = 0; i < Y4M_MAX_HEADER; i++ ) { header[i] = fgetc( h->fh ); if( header[i] == '\n' ) { /* Add a space after last option. Makes parsing "444" vs "444alpha" easier. */ header[i+1] = 0x20; header[i+2] = 0; break; } } FAIL_IF_ERROR( strncmp( header, Y4M_MAGIC, sizeof(Y4M_MAGIC)-1 ), "bad sequence header magic\n" ); FAIL_IF_ERROR( i == Y4M_MAX_HEADER, "bad sequence header length\n" ); /* Scan properties */ header_end = &header[i+1]; /* Include space */ h->seq_header_len = i+1; for( char *tokstart = header + sizeof(Y4M_MAGIC); tokstart < header_end; tokstart++ ) { if( *tokstart == 0x20 ) continue; switch( *tokstart++ ) { case 'W': /* Width. Required. */ info->width = strtol( tokstart, &tokend, 10 ); tokstart=tokend; break; case 'H': /* Height. Required. */ info->height = strtol( tokstart, &tokend, 10 ); tokstart=tokend; break; case 'C': /* Color space */ colorspace = parse_csp_and_depth( tokstart, &h->bit_depth ); tokstart = strchr( tokstart, 0x20 ); break; case 'I': /* Interlace type */ switch( *tokstart++ ) { case 't': info->interlaced = 1; info->tff = 1; break; case 'b': info->interlaced = 1; info->tff = 0; break; case 'm': info->interlaced = 1; break; //case '?': //case 'p': default: break; } break; case 'F': /* Frame rate - 0:0 if unknown */ if( sscanf( tokstart, "%u:%u", &n, &d ) == 2 && n && d ) { x264_reduce_fraction( &n, &d ); info->fps_num = n; info->fps_den = d; } tokstart = strchr( tokstart, 0x20 ); break; case 'A': /* Pixel aspect - 0:0 if unknown */ /* Don't override the aspect ratio if sar has been explicitly set on the commandline. 
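   (Illustrative aside: the parameter letters handled in this switch come from
   the single text line that opens every Y4M stream.  A typical, hypothetical
   header and the start of the first frame look like

       YUV4MPEG2 W1280 H720 F30000:1001 Ip A1:1 C420jpeg XCOLORRANGE=FULL
       FRAME
       <1280x720 Y samples><640x360 U samples><640x360 V samples>

   i.e. width 1280, height 720, 30000/1001 fps, progressive, square pixels,
   8-bit 4:2:0 with full range signalled via the X extension, followed by one
   FRAME line per picture.)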
*/ if( sscanf( tokstart, "%u:%u", &n, &d ) == 2 && n && d ) { x264_reduce_fraction( &n, &d ); info->sar_width = n; info->sar_height = d; } tokstart = strchr( tokstart, 0x20 ); break; case 'X': /* Vendor extensions */ if( !strncmp( "YSCSS=", tokstart, 6 ) ) { /* Older nonstandard pixel format representation */ tokstart += 6; alt_colorspace = parse_csp_and_depth( tokstart, &alt_bit_depth ); } else if( !strncmp( "COLORRANGE=", tokstart, 11 ) ) { /* ffmpeg's color range extension */ tokstart += 11; if( !strncmp( "FULL", tokstart, 4 ) ) info->fullrange = 1; else if( !strncmp( "LIMITED", tokstart, 7 ) ) info->fullrange = 0; } tokstart = strchr( tokstart, 0x20 ); break; } } if( colorspace == X264_CSP_NONE ) { colorspace = alt_colorspace; h->bit_depth = alt_bit_depth; } // default to 8bit 4:2:0 if nothing is specified if( colorspace == X264_CSP_NONE ) { colorspace = X264_CSP_I420; h->bit_depth = 8; } FAIL_IF_ERROR( colorspace <= X264_CSP_NONE || colorspace >= X264_CSP_MAX, "colorspace unhandled\n" ); FAIL_IF_ERROR( h->bit_depth < 8 || h->bit_depth > 16, "unsupported bit depth `%d'\n", h->bit_depth ); info->thread_safe = 1; info->num_frames = 0; info->csp = colorspace; if( h->bit_depth > 8 ) info->csp |= X264_CSP_HIGH_DEPTH; const x264_cli_csp_t *csp = x264_cli_get_csp( info->csp ); for( i = 0; i < csp->planes; i++ ) { h->plane_size[i] = x264_cli_pic_plane_size( info->csp, info->width, info->height, i ); h->frame_size += h->plane_size[i]; /* x264_cli_pic_plane_size returns the size in bytes, we need the value in pixels from here on */ h->plane_size[i] /= x264_cli_csp_depth_factor( info->csp ); } if( x264_is_regular_file( h->fh ) ) { int64_t init_pos = ftell( h->fh ); /* Find out the length of the frame header */ size_t len = 1; while( len <= Y4M_MAX_HEADER && fgetc( h->fh ) != '\n' ) len++; FAIL_IF_ERROR( len > Y4M_MAX_HEADER || len < sizeof(Y4M_FRAME_MAGIC), "bad frame header length\n" ); h->frame_header_len = len; h->frame_size += len; fseek( h->fh, 0, SEEK_END ); int64_t i_size = ftell( h->fh ); fseek( h->fh, init_pos, SEEK_SET ); info->num_frames = (i_size - h->seq_header_len) / h->frame_size; FAIL_IF_ERROR( !info->num_frames, "empty input file\n" ); /* Attempt to use memory-mapped input frames if possible */ if( !(h->bit_depth & 7) ) h->use_mmap = !x264_cli_mmap_init( &h->mmap, h->fh ); } *p_handle = h; return 0; } static int read_frame_internal( cli_pic_t *pic, y4m_hnd_t *h, int bit_depth_uc ) { static const size_t slen = sizeof(Y4M_FRAME_MAGIC)-1; int pixel_depth = x264_cli_csp_depth_factor( pic->img.csp ); int i = sizeof(Y4M_FRAME_MAGIC); char header_buf[16]; char *header; /* Verify that the frame header is valid */ if( h->use_mmap ) { header = (char*)pic->img.plane[0]; pic->img.plane[0] += h->frame_header_len; /* If the header length has changed between frames the size of the mapping will be invalid. * It might be possible to work around it, but I'm not aware of any tool beside fuzzers that * produces y4m files with variable-length frame headers so just error out if that happens. 
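 * For reference, the layout the mapping relies on is strictly periodic:
 *
 *     offset(i)  = seq_header_len + i * frame_size
 *     frame_size = frame_header_len + plane sizes in bytes
 *
 * e.g. a hypothetical 1280x720 8-bit 4:2:0 stream with the usual 6-byte
 * "FRAME\n" header gives frame_size = 6 + 1382400 = 1382406 bytes, so frame 3
 * starts 4147218 bytes past the sequence header.
 *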
*/ while( i <= h->frame_header_len && header[i-1] != '\n' ) i++; FAIL_IF_ERROR( i != h->frame_header_len, "bad frame header length\n" ); } else { header = header_buf; if( fread( header, 1, slen, h->fh ) != slen ) return -1; while( i <= Y4M_MAX_HEADER && fgetc( h->fh ) != '\n' ) i++; FAIL_IF_ERROR( i > Y4M_MAX_HEADER, "bad frame header length\n" ); } FAIL_IF_ERROR( memcmp( header, Y4M_FRAME_MAGIC, slen ), "bad frame header magic\n" ); for( i = 0; i < pic->img.planes; i++ ) { if( h->use_mmap ) { if( i ) pic->img.plane[i] = pic->img.plane[i-1] + pixel_depth * h->plane_size[i-1]; } else if( fread( pic->img.plane[i], pixel_depth, h->plane_size[i], h->fh ) != (uint64_t)h->plane_size[i] ) return -1; if( bit_depth_uc ) { /* upconvert non 16bit high depth planes to 16bit using the same * algorithm as used in the depth filter. */ uint16_t *plane = (uint16_t*)pic->img.plane[i]; int64_t pixel_count = h->plane_size[i]; int lshift = 16 - h->bit_depth; for( int64_t j = 0; j < pixel_count; j++ ) plane[j] = plane[j] << lshift; } } return 0; } static int read_frame( cli_pic_t *pic, hnd_t handle, int i_frame ) { y4m_hnd_t *h = handle; if( h->use_mmap ) { pic->img.plane[0] = x264_cli_mmap( &h->mmap, h->frame_size * i_frame + h->seq_header_len, h->frame_size ); if( !pic->img.plane[0] ) return -1; } else if( i_frame > h->next_frame ) { if( x264_is_regular_file( h->fh ) ) fseek( h->fh, h->frame_size * i_frame + h->seq_header_len, SEEK_SET ); else while( i_frame > h->next_frame ) { if( read_frame_internal( pic, h, 0 ) ) return -1; h->next_frame++; } } if( read_frame_internal( pic, h, h->bit_depth & 7 ) ) return -1; h->next_frame = i_frame+1; return 0; } static int release_frame( cli_pic_t *pic, hnd_t handle ) { y4m_hnd_t *h = handle; if( h->use_mmap ) return x264_cli_munmap( &h->mmap, pic->img.plane[0] - h->frame_header_len, h->frame_size ); return 0; } static int picture_alloc( cli_pic_t *pic, hnd_t handle, int csp, int width, int height ) { y4m_hnd_t *h = handle; return (h->use_mmap ? x264_cli_pic_init_noalloc : x264_cli_pic_alloc)( pic, csp, width, height ); } static void picture_clean( cli_pic_t *pic, hnd_t handle ) { y4m_hnd_t *h = handle; if( h->use_mmap ) memset( pic, 0, sizeof(cli_pic_t) ); else x264_cli_pic_clean( pic ); } static int close_file( hnd_t handle ) { y4m_hnd_t *h = handle; if( !h || !h->fh ) return 0; if( h->use_mmap ) x264_cli_mmap_close( &h->mmap ); fclose( h->fh ); free( h ); return 0; } const cli_input_t y4m_input = { open_file, picture_alloc, read_frame, release_frame, picture_clean, close_file }; x264-master/output/000077500000000000000000000000001502133446700144175ustar00rootroot00000000000000x264-master/output/flv.c000066400000000000000000000257731502133446700153700ustar00rootroot00000000000000/***************************************************************************** * flv.c: flv muxer ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Kieran Kunhya * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
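 *
 * (Illustrative aside on the muxer below: everything it writes is an 11-byte
 *  FLV tag header, the payload, then a 32-bit PreviousTagSize.  Roughly:
 *
 *      byte  0      tag type (0x09 video, 0x12 script data)
 *      bytes 1-3    DataSize, big endian (rewritten once the payload is known)
 *      bytes 4-6    timestamp in ms, low 24 bits
 *      byte  7      timestamp extended, high 8 bits
 *      bytes 8-10   StreamID, always 0
 *      ...          payload (VIDEODATA or the onMetaData AMF object)
 *      4 bytes      PreviousTagSize = 11 + DataSize
 *
 *  which is why the code records c->d_cur before the payload and patches the
 *  24-bit length afterwards with flv_rewrite_amf_be24.)
 *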
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "output.h" #include "flv_bytestream.h" #define CHECK(x)\ do {\ if( (x) < 0 )\ return -1;\ } while( 0 ) typedef struct { flv_buffer *c; uint8_t *sei; int sei_len; int64_t i_fps_num; int64_t i_fps_den; int64_t i_framenum; uint64_t i_framerate_pos; uint64_t i_duration_pos; uint64_t i_filesize_pos; uint64_t i_bitrate_pos; uint8_t b_write_length; int64_t i_prev_dts; int64_t i_prev_cts; int64_t i_delay_time; int64_t i_init_delta; int i_delay_frames; double d_timebase; int b_vfr_input; int b_dts_compress; unsigned start; } flv_hnd_t; static int write_header( flv_buffer *c ) { flv_put_tag( c, "FLV" ); // Signature flv_put_byte( c, 1 ); // Version flv_put_byte( c, 1 ); // Video Only flv_put_be32( c, 9 ); // DataOffset flv_put_be32( c, 0 ); // PreviousTagSize0 return flv_flush_data( c ); } static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { flv_hnd_t *p_flv = calloc( 1, sizeof(flv_hnd_t) ); if( p_flv ) { flv_buffer *c = flv_create_writer( psz_filename ); if( c ) { if( !write_header( c ) ) { p_flv->c = c; p_flv->b_dts_compress = opt->use_dts_compress; *p_handle = p_flv; return 0; } fclose( c->fp ); free( c->data ); free( c ); } free( p_flv ); } *p_handle = NULL; return -1; } static int set_param( hnd_t handle, x264_param_t *p_param ) { flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; flv_put_byte( c, FLV_TAG_TYPE_META ); // Tag Type "script data" int start = c->d_cur; flv_put_be24( c, 0 ); // data length flv_put_be24( c, 0 ); // timestamp flv_put_be32( c, 0 ); // reserved flv_put_byte( c, AMF_DATA_TYPE_STRING ); flv_put_amf_string( c, "onMetaData" ); flv_put_byte( c, AMF_DATA_TYPE_MIXEDARRAY ); flv_put_be32( c, 7 ); flv_put_amf_string( c, "width" ); flv_put_amf_double( c, p_param->i_width ); flv_put_amf_string( c, "height" ); flv_put_amf_double( c, p_param->i_height ); flv_put_amf_string( c, "framerate" ); if( !p_param->b_vfr_input ) flv_put_amf_double( c, (double)p_param->i_fps_num / p_param->i_fps_den ); else { p_flv->i_framerate_pos = c->d_cur + c->d_total + 1; flv_put_amf_double( c, 0 ); // written at end of encoding } flv_put_amf_string( c, "videocodecid" ); flv_put_amf_double( c, FLV_CODECID_H264 ); flv_put_amf_string( c, "duration" ); p_flv->i_duration_pos = c->d_cur + c->d_total + 1; flv_put_amf_double( c, 0 ); // written at end of encoding flv_put_amf_string( c, "filesize" ); p_flv->i_filesize_pos = c->d_cur + c->d_total + 1; flv_put_amf_double( c, 0 ); // written at end of encoding flv_put_amf_string( c, "videodatarate" ); p_flv->i_bitrate_pos = c->d_cur + c->d_total + 1; flv_put_amf_double( c, 0 ); // written at end of encoding flv_put_amf_string( c, "" ); flv_put_byte( c, AMF_END_OF_OBJECT ); unsigned length = c->d_cur - start; flv_rewrite_amf_be24( c, length - 10, start ); flv_put_be32( c, length + 1 ); // tag length p_flv->i_fps_num = p_param->i_fps_num; p_flv->i_fps_den = p_param->i_fps_den; p_flv->d_timebase = (double)p_param->i_timebase_num / p_param->i_timebase_den; p_flv->b_vfr_input = p_param->b_vfr_input; p_flv->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 
2 : 1) : 0; return 0; } static int write_headers( hnd_t handle, x264_nal_t *p_nal ) { flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; int sps_size = p_nal[0].i_payload; int pps_size = p_nal[1].i_payload; int sei_size = p_nal[2].i_payload; // SEI /* It is within the spec to write this as-is but for * mplayer/ffmpeg playback this is deferred until before the first frame */ p_flv->sei = malloc( sei_size ); if( !p_flv->sei ) return -1; p_flv->sei_len = sei_size; memcpy( p_flv->sei, p_nal[2].p_payload, sei_size ); // SPS uint8_t *sps = p_nal[0].p_payload + 4; flv_put_byte( c, FLV_TAG_TYPE_VIDEO ); flv_put_be24( c, 0 ); // rewrite later flv_put_be24( c, 0 ); // timestamp flv_put_byte( c, 0 ); // timestamp extended flv_put_be24( c, 0 ); // StreamID - Always 0 p_flv->start = c->d_cur; // needed for overwriting length flv_put_byte( c, FLV_FRAME_KEY | FLV_CODECID_H264 ); // FrameType and CodecID flv_put_byte( c, 0 ); // AVC sequence header flv_put_be24( c, 0 ); // composition time flv_put_byte( c, 1 ); // version flv_put_byte( c, sps[1] ); // profile flv_put_byte( c, sps[2] ); // profile flv_put_byte( c, sps[3] ); // level flv_put_byte( c, 0xff ); // 6 bits reserved (111111) + 2 bits nal size length - 1 (11) flv_put_byte( c, 0xe1 ); // 3 bits reserved (111) + 5 bits number of sps (00001) flv_put_be16( c, sps_size - 4 ); flv_append_data( c, sps, sps_size - 4 ); // PPS flv_put_byte( c, 1 ); // number of pps flv_put_be16( c, pps_size - 4 ); flv_append_data( c, p_nal[1].p_payload + 4, pps_size - 4 ); // rewrite data length info unsigned length = c->d_cur - p_flv->start; flv_rewrite_amf_be24( c, length, p_flv->start - 10 ); flv_put_be32( c, length + 11 ); // Last tag size CHECK( flv_flush_data( c ) ); return sei_size + sps_size + pps_size; } static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture ) { flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; #define convert_timebase_ms( timestamp, timebase ) (int64_t)((timestamp) * (timebase) * 1000 + 0.5) if( !p_flv->i_framenum ) { p_flv->i_delay_time = p_picture->i_dts * -1; if( !p_flv->b_dts_compress && p_flv->i_delay_time ) x264_cli_log( "flv", X264_LOG_INFO, "initial delay %"PRId64" ms\n", convert_timebase_ms( p_picture->i_pts + p_flv->i_delay_time, p_flv->d_timebase ) ); } int64_t dts; int64_t cts; int64_t offset; if( p_flv->b_dts_compress ) { if( p_flv->i_framenum == 1 ) p_flv->i_init_delta = convert_timebase_ms( p_picture->i_dts + p_flv->i_delay_time, p_flv->d_timebase ); dts = p_flv->i_framenum > p_flv->i_delay_frames ? 
convert_timebase_ms( p_picture->i_dts, p_flv->d_timebase ) : p_flv->i_framenum * p_flv->i_init_delta / (p_flv->i_delay_frames + 1); cts = convert_timebase_ms( p_picture->i_pts, p_flv->d_timebase ); } else { dts = convert_timebase_ms( p_picture->i_dts + p_flv->i_delay_time, p_flv->d_timebase ); cts = convert_timebase_ms( p_picture->i_pts + p_flv->i_delay_time, p_flv->d_timebase ); } offset = cts - dts; if( p_flv->i_framenum ) { if( p_flv->i_prev_dts == dts ) x264_cli_log( "flv", X264_LOG_WARNING, "duplicate DTS %"PRId64" generated by rounding\n" " decoding framerate cannot exceed 1000fps\n", dts ); if( p_flv->i_prev_cts == cts ) x264_cli_log( "flv", X264_LOG_WARNING, "duplicate CTS %"PRId64" generated by rounding\n" " composition framerate cannot exceed 1000fps\n", cts ); } p_flv->i_prev_dts = dts; p_flv->i_prev_cts = cts; // A new frame - write packet header flv_put_byte( c, FLV_TAG_TYPE_VIDEO ); flv_put_be24( c, 0 ); // calculated later flv_put_be24( c, dts ); flv_put_byte( c, dts >> 24 ); flv_put_be24( c, 0 ); p_flv->start = c->d_cur; flv_put_byte( c, (p_picture->b_keyframe ? FLV_FRAME_KEY : FLV_FRAME_INTER) | FLV_CODECID_H264 ); flv_put_byte( c, 1 ); // AVC NALU flv_put_be24( c, offset ); if( p_flv->sei ) { flv_append_data( c, p_flv->sei, p_flv->sei_len ); free( p_flv->sei ); p_flv->sei = NULL; } flv_append_data( c, p_nalu, i_size ); unsigned length = c->d_cur - p_flv->start; flv_rewrite_amf_be24( c, length, p_flv->start - 10 ); flv_put_be32( c, 11 + length ); // Last tag size CHECK( flv_flush_data( c ) ); p_flv->i_framenum++; return i_size; } static int rewrite_amf_double( FILE *fp, uint64_t position, double value ) { uint64_t x = endian_fix64( flv_dbl2int( value ) ); return !fseek( fp, position, SEEK_SET ) && fwrite( &x, 8, 1, fp ) == 1 ? 0 : -1; } #undef CHECK #define CHECK(x)\ do {\ if( (x) < 0 )\ goto error;\ } while( 0 ) static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { int ret = -1; flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; CHECK( flv_flush_data( c ) ); double total_duration; /* duration algorithm fails with one frame */ if( p_flv->i_framenum == 1 ) total_duration = p_flv->i_fps_num ? 
(double)p_flv->i_fps_den / p_flv->i_fps_num : 0; else total_duration = (2 * largest_pts - second_largest_pts) * p_flv->d_timebase; if( x264_is_regular_file( c->fp ) && total_duration > 0 ) { double framerate; int64_t filesize = ftell( c->fp ); if( p_flv->i_framerate_pos ) { framerate = (double)p_flv->i_framenum / total_duration; CHECK( rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ) ); } CHECK( rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ) ); CHECK( rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ) ); CHECK( rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8.0 / ( total_duration * 1000 ) ) ); } ret = 0; error: fclose( c->fp ); free( c->data ); free( c ); free( p_flv ); return ret; } const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file }; x264-master/output/flv_bytestream.c000066400000000000000000000071201502133446700176110ustar00rootroot00000000000000/***************************************************************************** * flv_bytestream.c: flv muxer utilities ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Kieran Kunhya * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
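 *
 * (Illustrative aside: AMF "number" values are IEEE-754 doubles stored big
 *  endian, so flv_dbl2int below only needs to reinterpret the bits.  A
 *  hypothetical standalone equivalent using memcpy instead of the union:
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static uint64_t dbl_bits( double d )
 *      {
 *          uint64_t u;
 *          memcpy( &u, &d, sizeof(u) );    // copies the bit pattern unchanged
 *          return u;
 *      }
 *
 *  With either version, flv_put_amf_double( c, 25.0 ) emits the type byte 0x00
 *  followed by 0x40 0x39 0x00 0x00 0x00 0x00 0x00 0x00.)
 *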
*****************************************************************************/ #include "output.h" #include "flv_bytestream.h" uint64_t flv_dbl2int( double value ) { return (union {double f; uint64_t i;}){value}.i; } /* Put functions */ void flv_put_byte( flv_buffer *c, uint8_t b ) { flv_append_data( c, &b, 1 ); } void flv_put_be32( flv_buffer *c, uint32_t val ) { flv_put_byte( c, val >> 24 ); flv_put_byte( c, val >> 16 ); flv_put_byte( c, val >> 8 ); flv_put_byte( c, val ); } void flv_put_be64( flv_buffer *c, uint64_t val ) { flv_put_be32( c, val >> 32 ); flv_put_be32( c, val ); } void flv_put_be16( flv_buffer *c, uint16_t val ) { flv_put_byte( c, val >> 8 ); flv_put_byte( c, val ); } void flv_put_be24( flv_buffer *c, uint32_t val ) { flv_put_be16( c, val >> 8 ); flv_put_byte( c, val ); } void flv_put_tag( flv_buffer *c, const char *tag ) { while( *tag ) flv_put_byte( c, *tag++ ); } void flv_put_amf_string( flv_buffer *c, const char *str ) { uint16_t len = strlen( str ); flv_put_be16( c, len ); flv_append_data( c, (uint8_t*)str, len ); } void flv_put_amf_double( flv_buffer *c, double d ) { flv_put_byte( c, AMF_DATA_TYPE_NUMBER ); flv_put_be64( c, flv_dbl2int( d ) ); } /* flv writing functions */ flv_buffer *flv_create_writer( const char *filename ) { flv_buffer *c = calloc( 1, sizeof(flv_buffer) ); if( !c ) return NULL; if( !strcmp( filename, "-" ) ) c->fp = stdout; else c->fp = x264_fopen( filename, "wb" ); if( !c->fp ) { free( c ); return NULL; } return c; } int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size ) { unsigned ns = c->d_cur + size; if( ns > c->d_max ) { void *dp; unsigned dn = 16; while( ns > dn ) dn <<= 1; dp = realloc( c->data, dn ); if( !dp ) return -1; c->data = dp; c->d_max = dn; } memcpy( c->data + c->d_cur, data, size ); c->d_cur = ns; return 0; } void flv_rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start ) { *(c->data + start + 0) = length >> 16; *(c->data + start + 1) = length >> 8; *(c->data + start + 2) = length >> 0; } int flv_flush_data( flv_buffer *c ) { if( !c->d_cur ) return 0; if( fwrite( c->data, c->d_cur, 1, c->fp ) != 1 ) return -1; c->d_total += c->d_cur; c->d_cur = 0; return 0; } x264-master/output/flv_bytestream.h000066400000000000000000000100141502133446700176120ustar00rootroot00000000000000/***************************************************************************** * flv_bytestream.h: flv muxer utilities ***************************************************************************** * Copyright (C) 2009-2025 x264 project * * Authors: Kieran Kunhya * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
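 *
 * (Illustrative aside: the first payload byte of every VIDEODATA tag packs the
 *  frame type into the high nibble and the codec id into the low nibble, which
 *  is what the 4-bit offsets and masks below describe.  For H.264 that gives
 *
 *      FLV_FRAME_KEY   | FLV_CODECID_H264  =  (1 << 4) | 7  =  0x17
 *      FLV_FRAME_INTER | FLV_CODECID_H264  =  (2 << 4) | 7  =  0x27
 *
 *  the two values the muxer in flv.c actually emits.)
 *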
*****************************************************************************/ #ifndef X264_FLV_BYTESTREAM_H #define X264_FLV_BYTESTREAM_H /* offsets for packed values */ #define FLV_AUDIO_SAMPLESSIZE_OFFSET 1 #define FLV_AUDIO_SAMPLERATE_OFFSET 2 #define FLV_AUDIO_CODECID_OFFSET 4 #define FLV_VIDEO_FRAMETYPE_OFFSET 4 /* bitmasks to isolate specific values */ #define FLV_AUDIO_CHANNEL_MASK 0x01 #define FLV_AUDIO_SAMPLESIZE_MASK 0x02 #define FLV_AUDIO_SAMPLERATE_MASK 0x0c #define FLV_AUDIO_CODECID_MASK 0xf0 #define FLV_VIDEO_CODECID_MASK 0x0f #define FLV_VIDEO_FRAMETYPE_MASK 0xf0 #define AMF_END_OF_OBJECT 0x09 enum { FLV_HEADER_FLAG_HASVIDEO = 1, FLV_HEADER_FLAG_HASAUDIO = 4, }; enum { FLV_TAG_TYPE_AUDIO = 0x08, FLV_TAG_TYPE_VIDEO = 0x09, FLV_TAG_TYPE_META = 0x12, }; enum { FLV_MONO = 0, FLV_STEREO = 1, }; enum { FLV_SAMPLESSIZE_8BIT = 0, FLV_SAMPLESSIZE_16BIT = 1 << FLV_AUDIO_SAMPLESSIZE_OFFSET, }; enum { FLV_SAMPLERATE_SPECIAL = 0, /**< signifies 5512Hz and 8000Hz in the case of NELLYMOSER */ FLV_SAMPLERATE_11025HZ = 1 << FLV_AUDIO_SAMPLERATE_OFFSET, FLV_SAMPLERATE_22050HZ = 2 << FLV_AUDIO_SAMPLERATE_OFFSET, FLV_SAMPLERATE_44100HZ = 3 << FLV_AUDIO_SAMPLERATE_OFFSET, }; enum { FLV_CODECID_MP3 = 2 << FLV_AUDIO_CODECID_OFFSET, FLV_CODECID_AAC = 10<< FLV_AUDIO_CODECID_OFFSET, }; enum { FLV_CODECID_H264 = 7, }; enum { FLV_FRAME_KEY = 1 << FLV_VIDEO_FRAMETYPE_OFFSET, FLV_FRAME_INTER = 2 << FLV_VIDEO_FRAMETYPE_OFFSET, }; typedef enum { AMF_DATA_TYPE_NUMBER = 0x00, AMF_DATA_TYPE_BOOL = 0x01, AMF_DATA_TYPE_STRING = 0x02, AMF_DATA_TYPE_OBJECT = 0x03, AMF_DATA_TYPE_NULL = 0x05, AMF_DATA_TYPE_UNDEFINED = 0x06, AMF_DATA_TYPE_REFERENCE = 0x07, AMF_DATA_TYPE_MIXEDARRAY = 0x08, AMF_DATA_TYPE_OBJECT_END = 0x09, AMF_DATA_TYPE_ARRAY = 0x0a, AMF_DATA_TYPE_DATE = 0x0b, AMF_DATA_TYPE_LONG_STRING = 0x0c, AMF_DATA_TYPE_UNSUPPORTED = 0x0d, } AMFDataType; typedef struct flv_buffer { uint8_t *data; unsigned d_cur; unsigned d_max; FILE *fp; uint64_t d_total; } flv_buffer; flv_buffer *flv_create_writer( const char *filename ); int flv_append_data( flv_buffer *c, uint8_t *data, unsigned size ); int flv_write_byte( flv_buffer *c, uint8_t *byte ); int flv_flush_data( flv_buffer *c ); void flv_rewrite_amf_be24( flv_buffer *c, unsigned length, unsigned start ); uint64_t flv_dbl2int( double value ); void flv_put_byte( flv_buffer *c, uint8_t b ); void flv_put_be32( flv_buffer *c, uint32_t val ); void flv_put_be64( flv_buffer *c, uint64_t val ); void flv_put_be16( flv_buffer *c, uint16_t val ); void flv_put_be24( flv_buffer *c, uint32_t val ); void flv_put_tag( flv_buffer *c, const char *tag ); void flv_put_amf_string( flv_buffer *c, const char *str ); void flv_put_amf_double( flv_buffer *c, double d ); #endif x264-master/output/matroska.c000066400000000000000000000143421502133446700164100ustar00rootroot00000000000000/***************************************************************************** * matroska.c: matroska muxer ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Mike Matsnev * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
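 *
 * (Illustrative aside on write_headers below: Matroska carries the decoder
 *  configuration as an avcC record in CodecPrivate rather than as raw Annex B
 *  NAL units.  The buffer assembled there is laid out as
 *
 *      [0]      0x01, configuration version
 *      [1]-[3]  profile, profile compatibility, level (copied from the SPS)
 *      [4]      0xff, NAL length fields are 4 bytes
 *      [5]      0xe1, one SPS follows
 *      ...      2-byte SPS size, SPS payload, 0x01 (one PPS), 2-byte PPS size,
 *               PPS payload
 *
 *  The SEI is not part of avcC, so it is queued and written at the start of
 *  the first frame instead.)
 *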
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "output.h" #include "matroska_ebml.h" typedef struct { mk_writer *w; int width, height, d_width, d_height; int display_size_units; int stereo_mode; int64_t frame_duration; char b_writing_frame; uint32_t i_timebase_num; uint32_t i_timebase_den; } mkv_hnd_t; static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { *p_handle = NULL; mkv_hnd_t *p_mkv = calloc( 1, sizeof(mkv_hnd_t) ); if( !p_mkv ) return -1; p_mkv->w = mk_create_writer( psz_filename ); if( !p_mkv->w ) { free( p_mkv ); return -1; } *p_handle = p_mkv; return 0; } #define STEREO_COUNT 7 static const uint8_t stereo_modes[STEREO_COUNT] = {5,9,7,1,3,13,0}; static const uint8_t stereo_w_div[STEREO_COUNT] = {1,2,1,2,1,1,1}; static const uint8_t stereo_h_div[STEREO_COUNT] = {1,1,2,1,2,1,1}; static int set_param( hnd_t handle, x264_param_t *p_param ) { mkv_hnd_t *p_mkv = handle; int64_t dw, dh; if( p_param->i_fps_num > 0 && !p_param->b_vfr_input ) { p_mkv->frame_duration = (int64_t)p_param->i_fps_den * (int64_t)1000000000 / p_param->i_fps_num; } else { p_mkv->frame_duration = 0; } dw = p_mkv->width = p_param->i_width; dh = p_mkv->height = p_param->i_height; p_mkv->display_size_units = DS_PIXELS; p_mkv->stereo_mode = -1; if( p_param->i_frame_packing >= 0 && p_param->i_frame_packing < STEREO_COUNT ) { p_mkv->stereo_mode = stereo_modes[p_param->i_frame_packing]; dw /= stereo_w_div[p_param->i_frame_packing]; dh /= stereo_h_div[p_param->i_frame_packing]; } if( p_param->vui.i_sar_width && p_param->vui.i_sar_height && p_param->vui.i_sar_width != p_param->vui.i_sar_height ) { if( p_param->vui.i_sar_width > p_param->vui.i_sar_height ) { dw = dw * p_param->vui.i_sar_width / p_param->vui.i_sar_height; } else { dh = dh * p_param->vui.i_sar_height / p_param->vui.i_sar_width; } } p_mkv->d_width = (int)dw; p_mkv->d_height = (int)dh; p_mkv->i_timebase_num = p_param->i_timebase_num; p_mkv->i_timebase_den = p_param->i_timebase_den; return 0; } static int write_headers( hnd_t handle, x264_nal_t *p_nal ) { mkv_hnd_t *p_mkv = handle; int sps_size = p_nal[0].i_payload - 4; int pps_size = p_nal[1].i_payload - 4; int sei_size = p_nal[2].i_payload; uint8_t *sps = p_nal[0].p_payload + 4; uint8_t *pps = p_nal[1].p_payload + 4; uint8_t *sei = p_nal[2].p_payload; int ret; uint8_t *avcC; int avcC_len; if( !p_mkv->width || !p_mkv->height || !p_mkv->d_width || !p_mkv->d_height ) return -1; avcC_len = 5 + 1 + 2 + sps_size + 1 + 2 + pps_size; avcC = malloc( avcC_len ); if( !avcC ) return -1; avcC[0] = 1; avcC[1] = sps[1]; avcC[2] = sps[2]; avcC[3] = sps[3]; avcC[4] = 0xff; // nalu size length is four bytes avcC[5] = 0xe1; // one sps avcC[6] = sps_size >> 8; avcC[7] = sps_size; memcpy( avcC+8, sps, sps_size ); avcC[8+sps_size] = 1; // one pps avcC[9+sps_size] = pps_size >> 8; avcC[10+sps_size] = pps_size; memcpy( avcC+11+sps_size, pps, pps_size ); ret = mk_write_header( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC", avcC, avcC_len, p_mkv->frame_duration, 50000, p_mkv->width, p_mkv->height, p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode 
); free( avcC ); if( ret < 0 ) return ret; // SEI if( !p_mkv->b_writing_frame ) { if( mk_start_frame( p_mkv->w ) < 0 ) return -1; p_mkv->b_writing_frame = 1; } if( mk_add_frame_data( p_mkv->w, sei, sei_size ) < 0 ) return -1; return sei_size + sps_size + pps_size; } static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture ) { mkv_hnd_t *p_mkv = handle; if( !p_mkv->b_writing_frame ) { if( mk_start_frame( p_mkv->w ) < 0 ) return -1; p_mkv->b_writing_frame = 1; } if( mk_add_frame_data( p_mkv->w, p_nalu, i_size ) < 0 ) return -1; int64_t i_stamp = (int64_t)((p_picture->i_pts * 1e9 * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5); p_mkv->b_writing_frame = 0; if( mk_set_frame_flags( p_mkv->w, i_stamp, p_picture->b_keyframe, p_picture->i_type == X264_TYPE_B ) < 0 ) return -1; return i_size; } static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { mkv_hnd_t *p_mkv = handle; int ret; int64_t i_last_delta; i_last_delta = p_mkv->i_timebase_den ? (int64_t)(((largest_pts - second_largest_pts) * p_mkv->i_timebase_num / p_mkv->i_timebase_den) + 0.5) : 0; ret = mk_close( p_mkv->w, i_last_delta ); free( p_mkv ); return ret; } const cli_output_t mkv_output = { open_file, set_param, write_headers, write_frame, close_file }; x264-master/output/matroska_ebml.c000066400000000000000000000324551502133446700174140ustar00rootroot00000000000000/***************************************************************************** * matroska_ebml.c: matroska muxer utilities ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Mike Matsnev * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
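 *
 * (Illustrative aside: EBML sizes are self-describing, the position of the
 *  first set bit in the leading byte tells the reader how many bytes the size
 *  field occupies, which is what mk_write_size below implements.  Two worked
 *  examples:
 *
 *      size 100  ->  0x80 | 100                      =  0xE4        (1 byte)
 *      size 300  ->  0x40 | (300 >> 8), 300 & 0xff   =  0x41 0x2C   (2 bytes)
 *
 *  so small elements cost a single length byte, while the 5-byte fallback
 *  covers any 32-bit size this writer needs.)
 *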
*****************************************************************************/ #include "output.h" #include "matroska_ebml.h" #define CLSIZE 1048576 #define CHECK(x)\ do {\ if( (x) < 0 )\ return -1;\ } while( 0 ) struct mk_context { struct mk_context *next, **prev, *parent; mk_writer *owner; unsigned id; void *data; unsigned d_cur, d_max; }; typedef struct mk_context mk_context; struct mk_writer { FILE *fp; unsigned duration_ptr; mk_context *root, *cluster, *frame; mk_context *freelist; mk_context *actlist; int64_t def_duration; int64_t timescale; int64_t cluster_tc_scaled; int64_t frame_tc, max_frame_tc; int8_t wrote_header, in_frame, keyframe, skippable; }; static mk_context *mk_create_context( mk_writer *w, mk_context *parent, unsigned id ) { mk_context *c; if( w->freelist ) { c = w->freelist; w->freelist = w->freelist->next; } else { c = calloc( 1, sizeof(mk_context) ); if( !c ) return NULL; } c->parent = parent; c->owner = w; c->id = id; if( c->owner->actlist ) c->owner->actlist->prev = &c->next; c->next = c->owner->actlist; c->prev = &c->owner->actlist; c->owner->actlist = c; return c; } static int mk_append_context_data( mk_context *c, const void *data, unsigned size ) { unsigned ns = c->d_cur + size; if( ns > c->d_max ) { void *dp; unsigned dn = c->d_max ? c->d_max << 1 : 16; while( ns > dn ) dn <<= 1; dp = realloc( c->data, dn ); if( !dp ) return -1; c->data = dp; c->d_max = dn; } memcpy( (uint8_t*)c->data + c->d_cur, data, size ); c->d_cur = ns; return 0; } static int mk_write_id( mk_context *c, unsigned id ) { uint8_t c_id[4] = { id >> 24, id >> 16, id >> 8, id }; if( c_id[0] ) return mk_append_context_data( c, c_id, 4 ); if( c_id[1] ) return mk_append_context_data( c, c_id+1, 3 ); if( c_id[2] ) return mk_append_context_data( c, c_id+2, 2 ); return mk_append_context_data( c, c_id+3, 1 ); } static int mk_write_size( mk_context *c, unsigned size ) { uint8_t c_size[5] = { 0x08, size >> 24, size >> 16, size >> 8, size }; if( size < 0x7f ) { c_size[4] |= 0x80; return mk_append_context_data( c, c_size+4, 1 ); } if( size < 0x3fff ) { c_size[3] |= 0x40; return mk_append_context_data( c, c_size+3, 2 ); } if( size < 0x1fffff ) { c_size[2] |= 0x20; return mk_append_context_data( c, c_size+2, 3 ); } if( size < 0x0fffffff ) { c_size[1] |= 0x10; return mk_append_context_data( c, c_size+1, 4 ); } return mk_append_context_data( c, c_size, 5 ); } static int mk_flush_context_id( mk_context *c ) { uint8_t ff = 0xff; if( !c->id ) return 0; CHECK( mk_write_id( c->parent, c->id ) ); CHECK( mk_append_context_data( c->parent, &ff, 1 ) ); c->id = 0; return 0; } static int mk_flush_context_data( mk_context *c ) { if( !c->d_cur ) return 0; if( c->parent ) CHECK( mk_append_context_data( c->parent, c->data, c->d_cur ) ); else if( fwrite( c->data, c->d_cur, 1, c->owner->fp ) != 1 ) return -1; c->d_cur = 0; return 0; } static int mk_close_context( mk_context *c, unsigned *off ) { if( c->id ) { CHECK( mk_write_id( c->parent, c->id ) ); CHECK( mk_write_size( c->parent, c->d_cur ) ); } if( c->parent && off ) *off += c->parent->d_cur; CHECK( mk_flush_context_data( c ) ); if( c->next ) c->next->prev = c->prev; *(c->prev) = c->next; c->next = c->owner->freelist; c->owner->freelist = c; return 0; } static void mk_destroy_contexts( mk_writer *w ) { mk_context *next; for( mk_context *cur = w->freelist; cur; cur = next ) { next = cur->next; free( cur->data ); free( cur ); } for( mk_context *cur = w->actlist; cur; cur = next ) { next = cur->next; free( cur->data ); free( cur ); } w->freelist = w->actlist = w->root = 
NULL; } static int mk_write_string( mk_context *c, unsigned id, const char *str ) { size_t len = strlen( str ); CHECK( mk_write_id( c, id ) ); CHECK( mk_write_size( c, len ) ); CHECK( mk_append_context_data( c, str, len ) ); return 0; } static int mk_write_bin( mk_context *c, unsigned id, const void *data, unsigned size ) { CHECK( mk_write_id( c, id ) ); CHECK( mk_write_size( c, size ) ); CHECK( mk_append_context_data( c, data, size ) ); return 0; } static int mk_write_uint( mk_context *c, unsigned id, uint64_t ui ) { uint8_t c_ui[8] = { ui >> 56, ui >> 48, ui >> 40, ui >> 32, ui >> 24, ui >> 16, ui >> 8, ui }; unsigned i = 0; CHECK( mk_write_id( c, id ) ); while( i < 7 && !c_ui[i] ) ++i; CHECK( mk_write_size( c, 8 - i ) ); CHECK( mk_append_context_data( c, c_ui+i, 8 - i ) ); return 0; } static int mk_write_float_raw( mk_context *c, float f ) { union { float f; uint32_t u; } u; uint8_t c_f[4]; u.f = f; c_f[0] = u.u >> 24; c_f[1] = u.u >> 16; c_f[2] = u.u >> 8; c_f[3] = u.u; return mk_append_context_data( c, c_f, 4 ); } static int mk_write_float( mk_context *c, unsigned id, float f ) { CHECK( mk_write_id( c, id ) ); CHECK( mk_write_size( c, 4 ) ); CHECK( mk_write_float_raw( c, f ) ); return 0; } mk_writer *mk_create_writer( const char *filename ) { mk_writer *w = calloc( 1, sizeof(mk_writer) ); if( !w ) return NULL; w->root = mk_create_context( w, NULL, 0 ); if( !w->root ) { free( w ); return NULL; } if( !strcmp( filename, "-" ) ) w->fp = stdout; else w->fp = x264_fopen( filename, "wb" ); if( !w->fp ) { mk_destroy_contexts( w ); free( w ); return NULL; } w->timescale = 1000000; return w; } int mk_write_header( mk_writer *w, const char *writing_app, const char *codec_id, const void *codec_private, unsigned codec_private_size, int64_t default_frame_duration, int64_t timescale, unsigned width, unsigned height, unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode ) { mk_context *c, *ti, *v; if( w->wrote_header ) return -1; w->timescale = timescale; w->def_duration = default_frame_duration; if( !(c = mk_create_context( w, w->root, 0x1a45dfa3 )) ) // EBML return -1; CHECK( mk_write_uint( c, 0x4286, 1 ) ); // EBMLVersion CHECK( mk_write_uint( c, 0x42f7, 1 ) ); // EBMLReadVersion CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType CHECK( mk_write_uint( c, 0x4287, stereo_mode >= 0 ? 
3 : 2 ) ); // DocTypeVersion CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadVersion CHECK( mk_close_context( c, 0 ) ); if( !(c = mk_create_context( w, w->root, 0x18538067 )) ) // Segment return -1; CHECK( mk_flush_context_id( c ) ); CHECK( mk_close_context( c, 0 ) ); if( !(c = mk_create_context( w, w->root, 0x1549a966 )) ) // SegmentInfo return -1; CHECK( mk_write_string( c, 0x4d80, "Haali Matroska Writer b0" ) ); // MuxingApp CHECK( mk_write_string( c, 0x5741, writing_app ) ); // WritingApp CHECK( mk_write_uint( c, 0x2ad7b1, w->timescale ) ); // TimecodeScale CHECK( mk_write_float( c, 0x4489, 0) ); // Duration w->duration_ptr = c->d_cur - 4; CHECK( mk_close_context( c, &w->duration_ptr ) ); if( !(c = mk_create_context( w, w->root, 0x1654ae6b )) ) // Tracks return -1; if( !(ti = mk_create_context( w, c, 0xae )) ) // TrackEntry return -1; CHECK( mk_write_uint( ti, 0xd7, 1 ) ); // TrackNumber CHECK( mk_write_uint( ti, 0x73c5, 1 ) ); // TrackUID CHECK( mk_write_uint( ti, 0x83, 1 ) ); // TrackType CHECK( mk_write_uint( ti, 0x9c, 0 ) ); // FlagLacing CHECK( mk_write_string( ti, 0x86, codec_id ) ); // CodecID if( codec_private_size ) CHECK( mk_write_bin( ti, 0x63a2, codec_private, codec_private_size ) ); // CodecPrivate if( default_frame_duration ) CHECK( mk_write_uint( ti, 0x23e383, default_frame_duration ) ); // DefaultDuration if( !(v = mk_create_context( w, ti, 0xe0 ) ) ) // Video return -1; CHECK( mk_write_uint( v, 0xb0, width ) ); // PixelWidth CHECK( mk_write_uint( v, 0xba, height ) ); // PixelHeight CHECK( mk_write_uint( v, 0x54b2, display_size_units ) ); // DisplayUnit CHECK( mk_write_uint( v, 0x54b0, d_width ) ); // DisplayWidth CHECK( mk_write_uint( v, 0x54ba, d_height ) ); // DisplayHeight if( stereo_mode >= 0 ) CHECK( mk_write_uint( v, 0x53b8, stereo_mode ) ); // StereoMode CHECK( mk_close_context( v, 0 ) ); CHECK( mk_close_context( ti, 0 ) ); CHECK( mk_close_context( c, 0 ) ); CHECK( mk_flush_context_data( w->root ) ); w->wrote_header = 1; return 0; } static int mk_close_cluster( mk_writer *w ) { if( w->cluster == NULL ) return 0; CHECK( mk_close_context( w->cluster, 0 ) ); w->cluster = NULL; CHECK( mk_flush_context_data( w->root ) ); return 0; } static int mk_flush_frame( mk_writer *w ) { int64_t delta; unsigned fsize; uint8_t c_delta_flags[3]; if( !w->in_frame ) return 0; delta = w->frame_tc/w->timescale - w->cluster_tc_scaled; if( delta > 32767ll || delta < -32768ll ) CHECK( mk_close_cluster( w ) ); if( !w->cluster ) { w->cluster_tc_scaled = w->frame_tc / w->timescale; w->cluster = mk_create_context( w, w->root, 0x1f43b675 ); // Cluster if( !w->cluster ) return -1; CHECK( mk_write_uint( w->cluster, 0xe7, w->cluster_tc_scaled ) ); // Timecode delta = 0; } fsize = w->frame ? 
w->frame->d_cur : 0; CHECK( mk_write_id( w->cluster, 0xa3 ) ); // SimpleBlock CHECK( mk_write_size( w->cluster, fsize + 4 ) ); // Size CHECK( mk_write_size( w->cluster, 1 ) ); // TrackNumber c_delta_flags[0] = (uint8_t)(delta >> 8); c_delta_flags[1] = (uint8_t)delta; c_delta_flags[2] = (w->keyframe << 7) | w->skippable; CHECK( mk_append_context_data( w->cluster, c_delta_flags, 3 ) ); // Timecode, Flags if( w->frame ) { CHECK( mk_append_context_data( w->cluster, w->frame->data, w->frame->d_cur ) ); // Data w->frame->d_cur = 0; } w->in_frame = 0; if( w->cluster->d_cur > CLSIZE ) CHECK( mk_close_cluster( w ) ); return 0; } int mk_start_frame( mk_writer *w ) { if( mk_flush_frame( w ) < 0 ) return -1; w->in_frame = 1; w->keyframe = 0; w->skippable = 0; return 0; } int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable ) { if( !w->in_frame ) return -1; w->frame_tc = timestamp; w->keyframe = keyframe != 0; w->skippable = skippable != 0; if( w->max_frame_tc < timestamp ) w->max_frame_tc = timestamp; return 0; } int mk_add_frame_data( mk_writer *w, const void *data, unsigned size ) { if( !w->in_frame ) return -1; if( !w->frame ) if( !(w->frame = mk_create_context( w, NULL, 0 )) ) return -1; return mk_append_context_data( w->frame, data, size ); } int mk_close( mk_writer *w, int64_t last_delta ) { int ret = 0; if( mk_flush_frame( w ) < 0 || mk_close_cluster( w ) < 0 ) ret = -1; if( w->wrote_header && x264_is_regular_file( w->fp ) ) { int64_t last_frametime = w->def_duration ? w->def_duration : last_delta; int64_t total_duration = w->max_frame_tc + last_frametime; if( fseek( w->fp, w->duration_ptr, SEEK_SET ) || mk_write_float_raw( w->root, (float)((double)total_duration / w->timescale) ) < 0 || mk_flush_context_data( w->root ) < 0 ) ret = -1; } mk_destroy_contexts( w ); fclose( w->fp ); free( w ); return ret; } x264-master/output/matroska_ebml.h000066400000000000000000000042431502133446700174130ustar00rootroot00000000000000/***************************************************************************** * matroska_ebml.h: matroska muxer utilities ***************************************************************************** * Copyright (C) 2005-2025 x264 project * * Authors: Mike Matsnev * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
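 *
 * Illustrative note on the EBML length coding implemented by mk_write_size()
 * above: the marker bit in the first byte encodes the field width, so for
 * example
 *
 *   size = 5    ->  0x85          (1 byte,  marker 0x80)
 *   size = 300  ->  0x41 0x2C     (2 bytes, marker 0x40)
 *
 * while mk_flush_context_id() writes a single 0xff as an "unknown size"
 * placeholder for elements whose length is not yet known (the Segment).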
*****************************************************************************/ #ifndef X264_MATROSKA_EBML_H #define X264_MATROSKA_EBML_H /* Matroska display size units from the spec */ #define DS_PIXELS 0 #define DS_CM 1 #define DS_INCHES 2 #define DS_ASPECT_RATIO 3 typedef struct mk_writer mk_writer; mk_writer *mk_create_writer( const char *filename ); int mk_write_header( mk_writer *w, const char *writing_app, const char *codec_id, const void *codec_private, unsigned codec_private_size, int64_t default_frame_duration, int64_t timescale, unsigned width, unsigned height, unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode ); int mk_start_frame( mk_writer *w ); int mk_add_frame_data( mk_writer *w, const void *data, unsigned size ); int mk_set_frame_flags( mk_writer *w, int64_t timestamp, int keyframe, int skippable ); int mk_close( mk_writer *w, int64_t last_delta ); #endif x264-master/output/mp4.c000066400000000000000000000303221502133446700152630ustar00rootroot00000000000000/***************************************************************************** * mp4.c: mp4 muxer ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
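 *
 * Illustrative usage sketch of the mk_writer API declared above (error
 * handling omitted and the argument values only indicative; see
 * output/matroska.c for the real caller):
 *
 *   mk_writer *w = mk_create_writer( "out.mkv" );
 *   mk_write_header( w, "my app", "V_MPEG4/ISO/AVC", avcC, avcC_size,
 *                    default_frame_duration, timescale,
 *                    width, height, d_width, d_height, DS_PIXELS, -1 );
 *   for each encoded frame:
 *       mk_start_frame( w );
 *       mk_add_frame_data( w, nal, nal_size );
 *       mk_set_frame_flags( w, timestamp_ns, keyframe, is_b_frame );
 *   mk_close( w, last_frame_delta );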
*****************************************************************************/ #include "output.h" #include typedef struct { GF_ISOFile *p_file; GF_AVCConfig *p_config; GF_ISOSample *p_sample; int i_track; uint32_t i_descidx; uint64_t i_time_res; int64_t i_time_inc; int64_t i_delay_time; int64_t i_init_delta; int i_numframe; int i_delay_frames; int b_dts_compress; int i_dts_compress_multiplier; int i_data_size; } mp4_hnd_t; static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track ) { u32 count, di, timescale, time_wnd, rate; u64 offset; Double br; GF_ESD *esd; esd = gf_isom_get_esd( p_file, i_track, 1 ); if( !esd ) return; esd->decoderConfig->avgBitrate = 0; esd->decoderConfig->maxBitrate = 0; rate = time_wnd = 0; timescale = gf_isom_get_media_timescale( p_file, i_track ); count = gf_isom_get_sample_count( p_file, i_track ); for( u32 i = 0; i < count; i++ ) { GF_ISOSample *samp = gf_isom_get_sample_info( p_file, i_track, i+1, &di, &offset ); if( !samp ) { x264_cli_log( "mp4", X264_LOG_ERROR, "failure reading back frame %u\n", i ); break; } if( esd->decoderConfig->bufferSizeDB < samp->dataLength ) esd->decoderConfig->bufferSizeDB = samp->dataLength; esd->decoderConfig->avgBitrate += samp->dataLength; rate += samp->dataLength; if( samp->DTS > time_wnd + timescale ) { if( rate > esd->decoderConfig->maxBitrate ) esd->decoderConfig->maxBitrate = rate; time_wnd = samp->DTS; rate = 0; } gf_isom_sample_del( &samp ); } br = (Double)(s64)gf_isom_get_media_duration( p_file, i_track ); br /= timescale; esd->decoderConfig->avgBitrate = (u32)(esd->decoderConfig->avgBitrate / br); /*move to bps*/ esd->decoderConfig->avgBitrate *= 8; esd->decoderConfig->maxBitrate *= 8; gf_isom_change_mpeg4_description( p_file, i_track, 1, esd ); gf_odf_desc_del( (GF_Descriptor*)esd ); } static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { mp4_hnd_t *p_mp4 = handle; if( !p_mp4 ) return 0; if( p_mp4->p_config ) gf_odf_avc_cfg_del( p_mp4->p_config ); if( p_mp4->p_sample ) { if( p_mp4->p_sample->data ) free( p_mp4->p_sample->data ); p_mp4->p_sample->dataLength = 0; gf_isom_sample_del( &p_mp4->p_sample ); } if( p_mp4->p_file ) { if( p_mp4->i_track ) { /* The mdhd duration is defined as CTS[final] - CTS[0] + duration of last frame. * The mdhd duration (in seconds) should be able to be longer than the tkhd duration since the track is managed by edts. * So, if mdhd duration is equal to the last DTS or less, we give the last composition time delta to the last sample duration. * And then, the mdhd duration is updated, but it time-wise doesn't give the actual duration. * The tkhd duration is the actual track duration. */ uint64_t mdhd_duration = (2 * largest_pts - second_largest_pts) * p_mp4->i_time_inc; if( mdhd_duration != gf_isom_get_media_duration( p_mp4->p_file, p_mp4->i_track ) ) { uint64_t last_dts = gf_isom_get_sample_dts( p_mp4->p_file, p_mp4->i_track, p_mp4->i_numframe ); uint32_t last_duration = (uint32_t)( mdhd_duration > last_dts ? mdhd_duration - last_dts : (largest_pts - second_largest_pts) * p_mp4->i_time_inc ); gf_isom_set_last_sample_duration( p_mp4->p_file, p_mp4->i_track, last_duration ); } /* Write an Edit Box if the first CTS offset is positive. * A media_time is given by not the mvhd timescale but rather the mdhd timescale. * The reason is that an Edit Box maps the presentation time-line to the media time-line. * Any demuxers should follow the Edit Box if it exists. 
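 *
 * Worked example of the mdhd_duration calculation above (illustrative
 * values): with largest_pts = 100, second_largest_pts = 99 and
 * i_time_inc = 1000, the expected duration is (2*100 - 99) * 1000 = 101000,
 * i.e. the last CTS plus one final frame duration derived from the last
 * CTS delta.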
*/ GF_ISOSample *sample = gf_isom_get_sample_info( p_mp4->p_file, p_mp4->i_track, 1, NULL, NULL ); if( sample && sample->CTS_Offset > 0 ) { uint32_t mvhd_timescale = gf_isom_get_timescale( p_mp4->p_file ); uint64_t tkhd_duration = (uint64_t)( mdhd_duration * ( (double)mvhd_timescale / p_mp4->i_time_res ) ); #if GPAC_VERSION_MAJOR > 8 gf_isom_append_edit( p_mp4->p_file, p_mp4->i_track, tkhd_duration, sample->CTS_Offset, GF_ISOM_EDIT_NORMAL ); #else gf_isom_append_edit_segment( p_mp4->p_file, p_mp4->i_track, tkhd_duration, sample->CTS_Offset, GF_ISOM_EDIT_NORMAL ); #endif } gf_isom_sample_del( &sample ); recompute_bitrate_mp4( p_mp4->p_file, p_mp4->i_track ); } gf_isom_set_pl_indication( p_mp4->p_file, GF_ISOM_PL_VISUAL, 0x15 ); gf_isom_set_storage_mode( p_mp4->p_file, GF_ISOM_STORE_FLAT ); gf_isom_close( p_mp4->p_file ); } free( p_mp4 ); return 0; } static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { *p_handle = NULL; FILE *fh = x264_fopen( psz_filename, "w" ); if( !fh ) return -1; int b_regular = x264_is_regular_file( fh ); fclose( fh ); FAIL_IF_ERR( !b_regular, "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename ); mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) ); if( !p_mp4 ) return -1; p_mp4->p_file = gf_isom_open( psz_filename, GF_ISOM_OPEN_WRITE, NULL ); p_mp4->b_dts_compress = opt->use_dts_compress; if( !(p_mp4->p_sample = gf_isom_sample_new()) ) { close_file( p_mp4, 0, 0 ); return -1; } gf_isom_set_brand_info( p_mp4->p_file, GF_ISOM_BRAND_AVC1, 0 ); *p_handle = p_mp4; return 0; } static int set_param( hnd_t handle, x264_param_t *p_param ) { mp4_hnd_t *p_mp4 = handle; p_mp4->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0; p_mp4->i_dts_compress_multiplier = p_mp4->b_dts_compress * p_mp4->i_delay_frames + 1; p_mp4->i_time_res = (uint64_t)p_param->i_timebase_den * p_mp4->i_dts_compress_multiplier; p_mp4->i_time_inc = (uint64_t)p_param->i_timebase_num * p_mp4->i_dts_compress_multiplier; FAIL_IF_ERR( p_mp4->i_time_res > UINT32_MAX, "mp4", "MP4 media timescale %"PRIu64" exceeds maximum\n", p_mp4->i_time_res ); p_mp4->i_track = gf_isom_new_track( p_mp4->p_file, 0, GF_ISOM_MEDIA_VISUAL, p_mp4->i_time_res ); p_mp4->p_config = gf_odf_avc_cfg_new(); gf_isom_avc_config_new( p_mp4->p_file, p_mp4->i_track, p_mp4->p_config, NULL, NULL, &p_mp4->i_descidx ); gf_isom_set_track_enabled( p_mp4->p_file, p_mp4->i_track, 1 ); gf_isom_set_visual_info( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->i_width, p_param->i_height ); if( p_param->vui.i_sar_width && p_param->vui.i_sar_height ) { uint64_t dw = p_param->i_width << 16; uint64_t dh = p_param->i_height << 16; double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height; if( sar > 1.0 ) dw *= sar; else dh /= sar; gf_isom_set_pixel_aspect_ratio( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_param->vui.i_sar_width, p_param->vui.i_sar_height, 0 ); gf_isom_set_track_layout_info( p_mp4->p_file, p_mp4->i_track, dw, dh, 0, 0, 0 ); } p_mp4->i_data_size = p_param->i_width * p_param->i_height * 3 / 2; p_mp4->p_sample->data = malloc( p_mp4->i_data_size ); if( !p_mp4->p_sample->data ) { p_mp4->i_data_size = 0; return -1; } return 0; } static int check_buffer( mp4_hnd_t *p_mp4, int needed_size ) { if( needed_size > p_mp4->i_data_size ) { void *ptr = realloc( p_mp4->p_sample->data, needed_size ); if( !ptr ) return -1; p_mp4->p_sample->data = ptr; p_mp4->i_data_size = needed_size; } return 0; } static int write_headers( hnd_t handle, x264_nal_t *p_nal 
) { mp4_hnd_t *p_mp4 = handle; GF_AVCConfigSlot *p_slot; int sps_size = p_nal[0].i_payload - 4; int pps_size = p_nal[1].i_payload - 4; int sei_size = p_nal[2].i_payload; uint8_t *sps = p_nal[0].p_payload + 4; uint8_t *pps = p_nal[1].p_payload + 4; uint8_t *sei = p_nal[2].p_payload; // SPS p_mp4->p_config->configurationVersion = 1; p_mp4->p_config->AVCProfileIndication = sps[1]; p_mp4->p_config->profile_compatibility = sps[2]; p_mp4->p_config->AVCLevelIndication = sps[3]; p_slot = malloc( sizeof(GF_AVCConfigSlot) ); if( !p_slot ) return -1; p_slot->size = sps_size; p_slot->data = malloc( p_slot->size ); if( !p_slot->data ) return -1; memcpy( p_slot->data, sps, sps_size ); gf_list_add( p_mp4->p_config->sequenceParameterSets, p_slot ); // PPS p_slot = malloc( sizeof(GF_AVCConfigSlot) ); if( !p_slot ) return -1; p_slot->size = pps_size; p_slot->data = malloc( p_slot->size ); if( !p_slot->data ) return -1; memcpy( p_slot->data, pps, pps_size ); gf_list_add( p_mp4->p_config->pictureParameterSets, p_slot ); gf_isom_avc_config_update( p_mp4->p_file, p_mp4->i_track, 1, p_mp4->p_config ); // SEI if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + sei_size ) ) return -1; memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, sei, sei_size ); p_mp4->p_sample->dataLength += sei_size; return sei_size + sps_size + pps_size; } static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture ) { mp4_hnd_t *p_mp4 = handle; int64_t dts; int64_t cts; if( check_buffer( p_mp4, p_mp4->p_sample->dataLength + i_size ) ) return -1; memcpy( p_mp4->p_sample->data + p_mp4->p_sample->dataLength, p_nalu, i_size ); p_mp4->p_sample->dataLength += i_size; if( !p_mp4->i_numframe ) p_mp4->i_delay_time = p_picture->i_dts * -1; if( p_mp4->b_dts_compress ) { if( p_mp4->i_numframe == 1 ) p_mp4->i_init_delta = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc; dts = p_mp4->i_numframe > p_mp4->i_delay_frames ? p_picture->i_dts * p_mp4->i_time_inc : p_mp4->i_numframe * (p_mp4->i_init_delta / p_mp4->i_dts_compress_multiplier); cts = p_picture->i_pts * p_mp4->i_time_inc; } else { dts = (p_picture->i_dts + p_mp4->i_delay_time) * p_mp4->i_time_inc; cts = (p_picture->i_pts + p_mp4->i_delay_time) * p_mp4->i_time_inc; } p_mp4->p_sample->IsRAP = p_picture->b_keyframe; p_mp4->p_sample->DTS = dts; p_mp4->p_sample->CTS_Offset = (uint32_t)(cts - dts); gf_isom_add_sample( p_mp4->p_file, p_mp4->i_track, p_mp4->i_descidx, p_mp4->p_sample ); p_mp4->p_sample->dataLength = 0; p_mp4->i_numframe++; return i_size; } const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file }; x264-master/output/mp4_lsmash.c000066400000000000000000000417661502133446700166500ustar00rootroot00000000000000/***************************************************************************** * mp4_lsmash.c: mp4 muxer using L-SMASH ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * Yusuke Nakamura * Takashi Hirata * golgol7777 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "output.h" #include #define H264_NALU_LENGTH_SIZE 4 /*******************/ #define MP4_LOG_ERROR( ... ) x264_cli_log( "mp4", X264_LOG_ERROR, __VA_ARGS__ ) #define MP4_LOG_WARNING( ... ) x264_cli_log( "mp4", X264_LOG_WARNING, __VA_ARGS__ ) #define MP4_LOG_INFO( ... ) x264_cli_log( "mp4", X264_LOG_INFO, __VA_ARGS__ ) #define MP4_FAIL_IF_ERR( cond, ... ) FAIL_IF_ERR( cond, "mp4", __VA_ARGS__ ) /* For close_file() */ #define MP4_LOG_IF_ERR( cond, ... )\ do\ {\ if( cond )\ {\ MP4_LOG_ERROR( __VA_ARGS__ );\ }\ } while( 0 ) /* For open_file() */ #define MP4_FAIL_IF_ERR_EX( cond, ... )\ do\ {\ if( cond )\ {\ remove_mp4_hnd( p_mp4 );\ MP4_LOG_ERROR( __VA_ARGS__ );\ return -1;\ }\ } while( 0 ) /*******************/ typedef struct { lsmash_root_t *p_root; lsmash_video_summary_t *summary; int b_stdout; uint32_t i_movie_timescale; uint32_t i_video_timescale; uint32_t i_track; uint32_t i_sample_entry; uint64_t i_time_inc; int64_t i_start_offset; uint64_t i_first_cts; uint64_t i_prev_dts; uint32_t i_sei_size; uint8_t *p_sei_buffer; int i_numframe; int64_t i_init_delta; int i_delay_frames; int b_dts_compress; int i_dts_compress_multiplier; int b_use_recovery; int b_fragments; lsmash_file_parameters_t file_param; } mp4_hnd_t; /*******************/ static void remove_mp4_hnd( hnd_t handle ) { mp4_hnd_t *p_mp4 = handle; if( !p_mp4 ) return; lsmash_cleanup_summary( (lsmash_summary_t *)p_mp4->summary ); lsmash_close_file( &p_mp4->file_param ); lsmash_destroy_root( p_mp4->p_root ); free( p_mp4->p_sei_buffer ); free( p_mp4 ); } /*******************/ static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { mp4_hnd_t *p_mp4 = handle; if( !p_mp4 ) return 0; if( p_mp4->p_root ) { double actual_duration = 0; if( p_mp4->i_track ) { /* Flush the rest of samples and add the last sample_delta. */ uint32_t last_delta = largest_pts - second_largest_pts; MP4_LOG_IF_ERR( lsmash_flush_pooled_samples( p_mp4->p_root, p_mp4->i_track, (last_delta ? last_delta : 1) * p_mp4->i_time_inc ), "failed to flush the rest of samples.\n" ); if( p_mp4->i_movie_timescale != 0 && p_mp4->i_video_timescale != 0 ) /* avoid zero division */ actual_duration = ((double)((largest_pts + last_delta) * p_mp4->i_time_inc) / p_mp4->i_video_timescale) * p_mp4->i_movie_timescale; else MP4_LOG_ERROR( "timescale is broken.\n" ); /* * Declare the explicit time-line mapping. * A segment_duration is given by movie timescale, while a media_time that is the start time of this segment * is given by not the movie timescale but rather the media timescale. * The reason is that ISO media have two time-lines, presentation and media time-line, * and an edit maps the presentation time-line to the media time-line. * According to QuickTime file format specification and the actual playback in QuickTime Player, * if the Edit Box doesn't exist in the track, the ratio of the summation of sample durations and track's duration becomes * the track's media_rate so that the entire media can be used by the track. 
* So, we add Edit Box here to avoid this implicit media_rate could distort track's presentation timestamps slightly. * Note: Any demuxers should follow the Edit List Box if it exists. */ lsmash_edit_t edit; edit.duration = actual_duration; edit.start_time = p_mp4->i_first_cts; edit.rate = ISOM_EDIT_MODE_NORMAL; if( !p_mp4->b_fragments ) { MP4_LOG_IF_ERR( lsmash_create_explicit_timeline_map( p_mp4->p_root, p_mp4->i_track, edit ), "failed to set timeline map for video.\n" ); } else if( !p_mp4->b_stdout ) MP4_LOG_IF_ERR( lsmash_modify_explicit_timeline_map( p_mp4->p_root, p_mp4->i_track, 1, edit ), "failed to update timeline map for video.\n" ); } MP4_LOG_IF_ERR( lsmash_finish_movie( p_mp4->p_root, NULL ), "failed to finish movie.\n" ); } remove_mp4_hnd( p_mp4 ); /* including lsmash_destroy_root( p_mp4->p_root ); */ return 0; } static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { *p_handle = NULL; int b_regular = strcmp( psz_filename, "-" ); b_regular = b_regular && x264_is_regular_file_path( psz_filename ); if( b_regular ) { FILE *fh = x264_fopen( psz_filename, "wb" ); MP4_FAIL_IF_ERR( !fh, "cannot open output file `%s'.\n", psz_filename ); b_regular = x264_is_regular_file( fh ); fclose( fh ); } mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) ); MP4_FAIL_IF_ERR( !p_mp4, "failed to allocate memory for muxer information.\n" ); p_mp4->b_dts_compress = opt->use_dts_compress; p_mp4->b_use_recovery = 0; // we don't really support recovery p_mp4->b_fragments = !b_regular; p_mp4->b_stdout = !strcmp( psz_filename, "-" ); p_mp4->p_root = lsmash_create_root(); MP4_FAIL_IF_ERR_EX( !p_mp4->p_root, "failed to create root.\n" ); MP4_FAIL_IF_ERR_EX( lsmash_open_file( psz_filename, 0, &p_mp4->file_param ) < 0, "failed to open an output file.\n" ); if( p_mp4->b_fragments ) p_mp4->file_param.mode |= LSMASH_FILE_MODE_FRAGMENTED; p_mp4->summary = (lsmash_video_summary_t *)lsmash_create_summary( LSMASH_SUMMARY_TYPE_VIDEO ); MP4_FAIL_IF_ERR_EX( !p_mp4->summary, "failed to allocate memory for summary information of video.\n" ); p_mp4->summary->sample_type = ISOM_CODEC_TYPE_AVC1_VIDEO; *p_handle = p_mp4; return 0; } static int set_param( hnd_t handle, x264_param_t *p_param ) { mp4_hnd_t *p_mp4 = handle; uint64_t i_media_timescale; p_mp4->i_delay_frames = p_param->i_bframe ? (p_param->i_bframe_pyramid ? 2 : 1) : 0; p_mp4->i_dts_compress_multiplier = p_mp4->b_dts_compress * p_mp4->i_delay_frames + 1; i_media_timescale = (uint64_t)p_param->i_timebase_den * p_mp4->i_dts_compress_multiplier; p_mp4->i_time_inc = (uint64_t)p_param->i_timebase_num * p_mp4->i_dts_compress_multiplier; MP4_FAIL_IF_ERR( i_media_timescale > UINT32_MAX, "MP4 media timescale %"PRIu64" exceeds maximum\n", i_media_timescale ); /* Select brands. 
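 *
 * Illustrative example of the timescale setup above: when DTS compression is
 * enabled (opt->use_dts_compress) with B-pyramid (i_delay_frames = 2) and a
 * 1/25 timebase, the multiplier is 1*2 + 1 = 3, so the media timescale
 * becomes 25*3 = 75 and i_time_inc becomes 1*3 = 3; without DTS compression
 * the multiplier stays 1 and the timebase is used unchanged.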
*/ lsmash_brand_type brands[6] = { 0 }; uint32_t brand_count = 0; brands[brand_count++] = ISOM_BRAND_TYPE_MP42; brands[brand_count++] = ISOM_BRAND_TYPE_MP41; brands[brand_count++] = ISOM_BRAND_TYPE_ISOM; if( p_mp4->b_use_recovery ) { brands[brand_count++] = ISOM_BRAND_TYPE_AVC1; /* sdtp, sgpd, sbgp and visual roll recovery grouping */ if( p_param->b_open_gop ) brands[brand_count++] = ISOM_BRAND_TYPE_ISO6; /* cslg and visual random access grouping */ } /* Set file */ lsmash_file_parameters_t *file_param = &p_mp4->file_param; file_param->major_brand = brands[0]; file_param->brands = brands; file_param->brand_count = brand_count; file_param->minor_version = 0; MP4_FAIL_IF_ERR( !lsmash_set_file( p_mp4->p_root, file_param ), "failed to add an output file into a ROOT.\n" ); /* Set movie parameters. */ lsmash_movie_parameters_t movie_param; lsmash_initialize_movie_parameters( &movie_param ); MP4_FAIL_IF_ERR( lsmash_set_movie_parameters( p_mp4->p_root, &movie_param ), "failed to set movie parameters.\n" ); p_mp4->i_movie_timescale = lsmash_get_movie_timescale( p_mp4->p_root ); MP4_FAIL_IF_ERR( !p_mp4->i_movie_timescale, "movie timescale is broken.\n" ); /* Create a video track. */ p_mp4->i_track = lsmash_create_track( p_mp4->p_root, ISOM_MEDIA_HANDLER_TYPE_VIDEO_TRACK ); MP4_FAIL_IF_ERR( !p_mp4->i_track, "failed to create a video track.\n" ); p_mp4->summary->width = p_param->i_width; p_mp4->summary->height = p_param->i_height; uint32_t i_display_width = p_param->i_width << 16; uint32_t i_display_height = p_param->i_height << 16; if( p_param->vui.i_sar_width && p_param->vui.i_sar_height ) { double sar = (double)p_param->vui.i_sar_width / p_param->vui.i_sar_height; if( sar > 1.0 ) i_display_width *= sar; else i_display_height /= sar; p_mp4->summary->par_h = p_param->vui.i_sar_width; p_mp4->summary->par_v = p_param->vui.i_sar_height; } p_mp4->summary->color.primaries_index = p_param->vui.i_colorprim; p_mp4->summary->color.transfer_index = p_param->vui.i_transfer; p_mp4->summary->color.matrix_index = p_param->vui.i_colmatrix >= 0 ? p_param->vui.i_colmatrix : ISOM_MATRIX_INDEX_UNSPECIFIED; p_mp4->summary->color.full_range = p_param->vui.b_fullrange >= 0 ? p_param->vui.b_fullrange : 0; /* Set video track parameters. */ lsmash_track_parameters_t track_param; lsmash_initialize_track_parameters( &track_param ); lsmash_track_mode track_mode = ISOM_TRACK_ENABLED | ISOM_TRACK_IN_MOVIE | ISOM_TRACK_IN_PREVIEW; track_param.mode = track_mode; track_param.display_width = i_display_width; track_param.display_height = i_display_height; MP4_FAIL_IF_ERR( lsmash_set_track_parameters( p_mp4->p_root, p_mp4->i_track, &track_param ), "failed to set track parameters for video.\n" ); /* Set video media parameters. 
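 *
 * Illustrative example of the display-size computation above: the track
 * header stores sizes in 16.16 fixed point, so for 720x480 with SAR 40:33
 * (sar ~ 1.212 > 1.0) the display width becomes roughly 720 * 1.212 = 873
 * pixels (about 872.7 << 16), while the display height stays 480 << 16.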
*/ lsmash_media_parameters_t media_param; lsmash_initialize_media_parameters( &media_param ); media_param.timescale = i_media_timescale; media_param.media_handler_name = "L-SMASH Video Media Handler"; if( p_mp4->b_use_recovery ) { media_param.roll_grouping = p_param->b_intra_refresh; media_param.rap_grouping = p_param->b_open_gop; } MP4_FAIL_IF_ERR( lsmash_set_media_parameters( p_mp4->p_root, p_mp4->i_track, &media_param ), "failed to set media parameters for video.\n" ); p_mp4->i_video_timescale = lsmash_get_media_timescale( p_mp4->p_root, p_mp4->i_track ); MP4_FAIL_IF_ERR( !p_mp4->i_video_timescale, "media timescale for video is broken.\n" ); return 0; } static int write_headers( hnd_t handle, x264_nal_t *p_nal ) { mp4_hnd_t *p_mp4 = handle; uint32_t sps_size = p_nal[0].i_payload - H264_NALU_LENGTH_SIZE; uint32_t pps_size = p_nal[1].i_payload - H264_NALU_LENGTH_SIZE; uint32_t sei_size = p_nal[2].i_payload; uint8_t *sps = p_nal[0].p_payload + H264_NALU_LENGTH_SIZE; uint8_t *pps = p_nal[1].p_payload + H264_NALU_LENGTH_SIZE; uint8_t *sei = p_nal[2].p_payload; lsmash_codec_specific_t *cs = lsmash_create_codec_specific_data( LSMASH_CODEC_SPECIFIC_DATA_TYPE_ISOM_VIDEO_H264, LSMASH_CODEC_SPECIFIC_FORMAT_STRUCTURED ); lsmash_h264_specific_parameters_t *param = (lsmash_h264_specific_parameters_t *)cs->data.structured; param->lengthSizeMinusOne = H264_NALU_LENGTH_SIZE - 1; /* SPS * The remaining parameters are automatically set by SPS. */ if( lsmash_append_h264_parameter_set( param, H264_PARAMETER_SET_TYPE_SPS, sps, sps_size ) ) { MP4_LOG_ERROR( "failed to append SPS.\n" ); return -1; } /* PPS */ if( lsmash_append_h264_parameter_set( param, H264_PARAMETER_SET_TYPE_PPS, pps, pps_size ) ) { MP4_LOG_ERROR( "failed to append PPS.\n" ); return -1; } if( lsmash_add_codec_specific_data( (lsmash_summary_t *)p_mp4->summary, cs ) ) { MP4_LOG_ERROR( "failed to add H.264 specific info.\n" ); return -1; } lsmash_destroy_codec_specific_data( cs ); /* Additional extensions */ /* Bitrate info */ cs = lsmash_create_codec_specific_data( LSMASH_CODEC_SPECIFIC_DATA_TYPE_ISOM_VIDEO_H264_BITRATE, LSMASH_CODEC_SPECIFIC_FORMAT_STRUCTURED ); if( cs ) lsmash_add_codec_specific_data( (lsmash_summary_t *)p_mp4->summary, cs ); lsmash_destroy_codec_specific_data( cs ); p_mp4->i_sample_entry = lsmash_add_sample_entry( p_mp4->p_root, p_mp4->i_track, p_mp4->summary ); MP4_FAIL_IF_ERR( !p_mp4->i_sample_entry, "failed to add sample entry for video.\n" ); /* SEI */ p_mp4->p_sei_buffer = malloc( sei_size ); MP4_FAIL_IF_ERR( !p_mp4->p_sei_buffer, "failed to allocate sei transition buffer.\n" ); memcpy( p_mp4->p_sei_buffer, sei, sei_size ); p_mp4->i_sei_size = sei_size; return sei_size + sps_size + pps_size; } static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture ) { mp4_hnd_t *p_mp4 = handle; uint64_t dts, cts; if( !p_mp4->i_numframe ) { p_mp4->i_start_offset = p_picture->i_dts * -1; p_mp4->i_first_cts = p_mp4->b_dts_compress ? 0 : p_mp4->i_start_offset * p_mp4->i_time_inc; if( p_mp4->b_fragments ) { lsmash_edit_t edit; edit.duration = ISOM_EDIT_DURATION_UNKNOWN32; /* QuickTime doesn't support 64bit duration. 
*/ edit.start_time = p_mp4->i_first_cts; edit.rate = ISOM_EDIT_MODE_NORMAL; MP4_LOG_IF_ERR( lsmash_create_explicit_timeline_map( p_mp4->p_root, p_mp4->i_track, edit ), "failed to set timeline map for video.\n" ); } } lsmash_sample_t *p_sample = lsmash_create_sample( i_size + p_mp4->i_sei_size ); MP4_FAIL_IF_ERR( !p_sample, "failed to create a video sample data.\n" ); if( p_mp4->p_sei_buffer ) { memcpy( p_sample->data, p_mp4->p_sei_buffer, p_mp4->i_sei_size ); free( p_mp4->p_sei_buffer ); p_mp4->p_sei_buffer = NULL; } memcpy( p_sample->data + p_mp4->i_sei_size, p_nalu, i_size ); p_mp4->i_sei_size = 0; if( p_mp4->b_dts_compress ) { if( p_mp4->i_numframe == 1 ) p_mp4->i_init_delta = (p_picture->i_dts + p_mp4->i_start_offset) * p_mp4->i_time_inc; dts = p_mp4->i_numframe > p_mp4->i_delay_frames ? p_picture->i_dts * p_mp4->i_time_inc : p_mp4->i_numframe * (p_mp4->i_init_delta / p_mp4->i_dts_compress_multiplier); cts = p_picture->i_pts * p_mp4->i_time_inc; } else { dts = (p_picture->i_dts + p_mp4->i_start_offset) * p_mp4->i_time_inc; cts = (p_picture->i_pts + p_mp4->i_start_offset) * p_mp4->i_time_inc; } p_sample->dts = dts; p_sample->cts = cts; p_sample->index = p_mp4->i_sample_entry; p_sample->prop.ra_flags = p_picture->b_keyframe ? ISOM_SAMPLE_RANDOM_ACCESS_FLAG_SYNC : ISOM_SAMPLE_RANDOM_ACCESS_FLAG_NONE; if( p_mp4->b_fragments && p_mp4->i_numframe && p_sample->prop.ra_flags != ISOM_SAMPLE_RANDOM_ACCESS_FLAG_NONE ) { MP4_FAIL_IF_ERR( lsmash_flush_pooled_samples( p_mp4->p_root, p_mp4->i_track, p_sample->dts - p_mp4->i_prev_dts ), "failed to flush the rest of samples.\n" ); MP4_FAIL_IF_ERR( lsmash_create_fragment_movie( p_mp4->p_root ), "failed to create a movie fragment.\n" ); } /* Append data per sample. */ MP4_FAIL_IF_ERR( lsmash_append_sample( p_mp4->p_root, p_mp4->i_track, p_sample ), "failed to append a video frame.\n" ); p_mp4->i_prev_dts = dts; p_mp4->i_numframe++; return i_size; } const cli_output_t mp4_output = { open_file, set_param, write_headers, write_frame, close_file }; x264-master/output/output.h000066400000000000000000000037011502133446700161310ustar00rootroot00000000000000/***************************************************************************** * output.h: x264 file output modules ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
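 *
 * Illustrative note on the DTS-compression path in the L-SMASH write_frame()
 * above (assuming one B-frame, i.e. i_delay_frames = 1 and multiplier = 2,
 * and an encoder DTS sequence of -1, 0, 1, 2, ...): i_start_offset becomes 1
 * and i_init_delta becomes i_time_inc, so the muxed DTS come out as
 * 0, i_time_inc/2, 1*i_time_inc, 2*i_time_inc, ... while CTS is written
 * unshifted; the doubled media timescale is what makes the fractional first
 * interval representable as an integer.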
*****************************************************************************/ #ifndef X264_OUTPUT_H #define X264_OUTPUT_H #include "x264cli.h" typedef struct { int use_dts_compress; } cli_output_opt_t; typedef struct { int (*open_file)( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ); int (*set_param)( hnd_t handle, x264_param_t *p_param ); int (*write_headers)( hnd_t handle, x264_nal_t *p_nal ); int (*write_frame)( hnd_t handle, uint8_t *p_nal, int i_size, x264_picture_t *p_picture ); int (*close_file)( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ); } cli_output_t; extern const cli_output_t raw_output; extern const cli_output_t mkv_output; extern const cli_output_t mp4_output; extern const cli_output_t flv_output; #endif x264-master/output/raw.c000066400000000000000000000044231502133446700153570ustar00rootroot00000000000000/***************************************************************************** * raw.c: raw muxer ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Laurent Aimar * Loren Merritt * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. 
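 *
 * Illustrative note on the cli_output_t interface defined in output.h above:
 * the CLI drives every muxer through the same sequence -- open_file(),
 * set_param(), write_headers() once with the SPS/PPS/SEI NAL units,
 * write_frame() per encoded picture, then close_file() with the two largest
 * PTS values so the muxer can derive the duration of the final frame.  The
 * raw muxer below is the smallest implementation: it ignores timestamps and
 * writes the encoded NAL units straight to the file.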
*****************************************************************************/ #include "output.h" static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { if( !strcmp( psz_filename, "-" ) ) *p_handle = stdout; else if( !(*p_handle = x264_fopen( psz_filename, "w+b" )) ) return -1; return 0; } static int set_param( hnd_t handle, x264_param_t *p_param ) { return 0; } static int write_headers( hnd_t handle, x264_nal_t *p_nal ) { int size = p_nal[0].i_payload + p_nal[1].i_payload + p_nal[2].i_payload; if( fwrite( p_nal[0].p_payload, size, 1, (FILE*)handle ) ) return size; return -1; } static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_t *p_picture ) { if( fwrite( p_nalu, i_size, 1, (FILE*)handle ) ) return i_size; return -1; } static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { if( !handle || handle == stdout ) return 0; return fclose( (FILE*)handle ); } const cli_output_t raw_output = { open_file, set_param, write_headers, write_frame, close_file }; x264-master/tools/000077500000000000000000000000001502133446700142175ustar00rootroot00000000000000x264-master/tools/bash-autocomplete.sh000066400000000000000000000007131502133446700201700ustar00rootroot00000000000000_x264() { local path args cur prev path="${COMP_LINE%%[[:blank:]]*}" args="${COMP_LINE:${#path}:$((COMP_POINT-${#path}))}" cur="${args##*[[:blank:]=]}" prev="$(sed 's/[[:blank:]=]*$//; s/^.*[[:blank:]]//' <<< "${args%%"$cur"}")" # Expand ~ printf -v path '%q' "$path" && eval path="${path/#'\~'/'~'}" COMPREPLY=($("$path" --autocomplete "$prev" "$cur")) && compopt +o default } 2>/dev/null complete -o default -F _x264 x264 x264-master/tools/checkasm-a.asm000066400000000000000000000137341502133446700167250ustar00rootroot00000000000000;***************************************************************************** ;* checkasm-a.asm: assembly check tool ;***************************************************************************** ;* Copyright (C) 2008-2025 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. ;* ;* This program is also available under a commercial proprietary license. ;* For more information, contact us at licensing@x264.com. 
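;*
;* Overview of the technique used below (descriptive note): checkasm_call
;* loads the callee-saved GPRs (and xmm6-15 on WIN64) with known magic
;* constants before calling the function under test, then XORs them with the
;* same constants afterwards; any non-zero result means a register was not
;* preserved, so the "ok" flag is cleared and the error message is printed.
;* checkasm_stack_clobber additionally fills the memory below the stack
;* pointer with junk so reads of uninitialized stack slots are more likely
;* to be caught.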
;***************************************************************************** %include "x86inc.asm" SECTION_RODATA error_message: db "failed to preserve register", 0 %if ARCH_X86_64 ; just random numbers to reduce the chance of incidental match ALIGN 16 x6: dq 0x1a1b2550a612b48c,0x79445c159ce79064 x7: dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636 x8: dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e x9: dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9 x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786 x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5 n7: dq 0x21f86d66c8ca00ce n8: dq 0x75b6ba21077c48ad n9: dq 0xed56bb2dcb3c7736 n10: dq 0x8bda43d3fd1a7e06 n11: dq 0xb64a9c9e5d318408 n12: dq 0xdf9a54b303f1d3a3 n13: dq 0x4a75479abd64e097 n14: dq 0x249214109d5d1c88 %endif SECTION .text cextern_naked puts ; max number of args used by any x264 asm function. %define max_args 15 %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void x264_checkasm_stack_clobber( uint64_t clobber, ... ) ;----------------------------------------------------------------------------- cglobal checkasm_stack_clobber, 1,2 ; Clobber the stack with junk below the stack pointer %define argsize (max_args+6)*8 SUB rsp, argsize mov r1, argsize-8 .loop: mov [rsp+r1], r0 sub r1, 8 jge .loop ADD rsp, argsize RET %if WIN64 %assign free_regs 7 %else %assign free_regs 9 %endif ;----------------------------------------------------------------------------- ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- INIT_XMM cglobal checkasm_call, 2,15,16,-1*(((max_args+1)*8+STACK_ALIGNMENT-1) & ~(STACK_ALIGNMENT-1)) mov r6, r0 mov [rsp+max_args*8], r1 ; All arguments have been pushed on the stack instead of registers in order to ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit. mov r0, r6mp mov r1, r7mp mov r2, r8mp mov r3, r9mp %if UNIX64 mov r4, r10mp mov r5, r11mp %assign i 6 %rep max_args-6 mov r9, [rstk+stack_offset+(i+1)*8] mov [rsp+(i-6)*8], r9 %assign i i+1 %endrep %else %assign i 4 %rep max_args-4 mov r9, [rstk+stack_offset+(i+7)*8] mov [rsp+i*8], r9 %assign i i+1 %endrep %endif %if WIN64 %assign i 6 %rep 16-6 mova m %+ i, [x %+ i] %assign i i+1 %endrep %endif %assign i 14 %rep 15-free_regs mov r %+ i, [n %+ i] %assign i i-1 %endrep call r6 %assign i 14 %rep 15-free_regs xor r %+ i, [n %+ i] or r14, r %+ i %assign i i-1 %endrep %if WIN64 %assign i 6 %rep 16-6 pxor m %+ i, [x %+ i] por m6, m %+ i %assign i i+1 %endrep packsswb m6, m6 movq r5, m6 or r14, r5 %endif jz .ok mov r9, rax mov r10, rdx lea r0, [error_message] call puts mov r1, [rsp+max_args*8] mov dword [r1], 0 mov rdx, r10 mov rax, r9 .ok: RET %else ; just random numbers to reduce the chance of incidental match %define n3 dword 0x6549315c %define n4 dword 0xe02f3e23 %define n5 dword 0xb78d0d1d %define n6 dword 0x33627ba7 ;----------------------------------------------------------------------------- ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... 
) ;----------------------------------------------------------------------------- cglobal checkasm_call, 2,7,0,-1*(((max_args+1)*4+STACK_ALIGNMENT-1) & ~(STACK_ALIGNMENT-1)) mov [esp+max_args*4], r1 %assign i 0 %rep max_args mov r1, [rstk+stack_offset+12+i*4] mov [esp+i*4], r1 %assign i i+1 %endrep mov r3, n3 mov r4, n4 mov r5, n5 mov r6, n6 call r0 xor r3, n3 xor r4, n4 xor r5, n5 xor r6, n6 or r3, r4 or r5, r6 or r3, r5 jz .ok mov r3, eax mov r4, edx lea r1, [error_message] mov [esp], r1 call puts mov r1, [esp+max_args*4] mov dword [r1], 0 mov edx, r4 mov eax, r3 .ok: REP_RET %endif ; ARCH_X86_64 ;----------------------------------------------------------------------------- ; int x264_stack_pagealign( int (*func)(), int align ) ;----------------------------------------------------------------------------- cglobal stack_pagealign, 2,2 movsxdifnidn r1, r1d push rbp mov rbp, rsp %if WIN64 sub rsp, 32 ; shadow space %endif and rsp, ~0xfff sub rsp, r1 call r0 leave RET ; Trigger a warmup of vector units %macro WARMUP 0 cglobal checkasm_warmup, 0,0 xorps m0, m0 RET %endmacro INIT_YMM avx WARMUP INIT_ZMM avx512 WARMUP x264-master/tools/checkasm-aarch64.S000066400000000000000000000116311502133446700173510ustar00rootroot00000000000000/**************************************************************************** * checkasm-aarch64.S: assembly check tool ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Martin Storsjo * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "../common/aarch64/asm.S" const register_init, align=4 .quad 0x21f86d66c8ca00ce .quad 0x75b6ba21077c48ad .quad 0xed56bb2dcb3c7736 .quad 0x8bda43d3fd1a7e06 .quad 0xb64a9c9e5d318408 .quad 0xdf9a54b303f1d3a3 .quad 0x4a75479abd64e097 .quad 0x249214109d5d1c88 .quad 0x1a1b2550a612b48c .quad 0x79445c159ce79064 .quad 0x2eed899d5a28ddcd .quad 0x86b2536fcd8cf636 .quad 0xb0856806085e7943 .quad 0x3f2bf84fc0fcca4e .quad 0xacbd382dcf5b8de2 .quad 0xd229e1f5b281303f .quad 0x71aeaff20b095fd9 .quad 0xab63e2e11fa38ed9 endconst const error_message .asciz "failed to preserve register" endconst .text // max number of args used by any x264 asm function. #define MAX_ARGS 15 #define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15) function checkasm_stack_clobber, export=1 mov x3, sp mov x2, #CLOBBER_STACK 1: stp x0, x1, [sp, #-16]! subs x2, x2, #16 b.gt 1b mov sp, x3 ret endfunc #define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15) function checkasm_call, export=1 stp x29, x30, [sp, #-16]! mov x29, sp stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, x28, [sp, #-16]! 
stp d8, d9, [sp, #-16]! stp d10, d11, [sp, #-16]! stp d12, d13, [sp, #-16]! stp d14, d15, [sp, #-16]! movrel x9, register_init ldp d8, d9, [x9], #16 ldp d10, d11, [x9], #16 ldp d12, d13, [x9], #16 ldp d14, d15, [x9], #16 ldp x19, x20, [x9], #16 ldp x21, x22, [x9], #16 ldp x23, x24, [x9], #16 ldp x25, x26, [x9], #16 ldp x27, x28, [x9], #16 str x1, [sp, #-16]! sub sp, sp, #ARG_STACK .equ pos, 0 .rept MAX_ARGS-8 // Skip the first 8 args, that are loaded into registers ldr x9, [x29, #16 + 8*8 + pos] str x9, [sp, #pos] .equ pos, pos + 8 .endr mov x12, x0 ldp x0, x1, [x29, #16] ldp x2, x3, [x29, #32] ldp x4, x5, [x29, #48] ldp x6, x7, [x29, #64] blr x12 add sp, sp, #ARG_STACK ldr x2, [sp] stp x0, x1, [sp] movrel x9, register_init movi v3.8h, #0 .macro check_reg_neon reg1, reg2 ldr q0, [x9], #16 uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d eor v0.16b, v0.16b, v1.16b orr v3.16b, v3.16b, v0.16b .endm check_reg_neon 8, 9 check_reg_neon 10, 11 check_reg_neon 12, 13 check_reg_neon 14, 15 uqxtn v3.8b, v3.8h umov x3, v3.d[0] .macro check_reg reg1, reg2 ldp x0, x1, [x9], #16 eor x0, x0, \reg1 eor x1, x1, \reg2 orr x3, x3, x0 orr x3, x3, x1 .endm check_reg x19, x20 check_reg x21, x22 check_reg x23, x24 check_reg x25, x26 check_reg x27, x28 cbz x3, 0f mov w9, #0 str w9, [x2] movrel x0, error_message bl EXT(puts) 0: ldp x0, x1, [sp], #16 ldp d14, d15, [sp], #16 ldp d12, d13, [sp], #16 ldp d10, d11, [sp], #16 ldp d8, d9, [sp], #16 ldp x27, x28, [sp], #16 ldp x25, x26, [sp], #16 ldp x23, x24, [sp], #16 ldp x21, x22, [sp], #16 ldp x19, x20, [sp], #16 ldp x29, x30, [sp], #16 ret endfunc #if HAVE_SVE ENABLE_SVE function checkasm_sve_length, export=1 cntb x0 lsl x0, x0, #3 ret endfunc DISABLE_SVE #endif x264-master/tools/checkasm-arm.S000066400000000000000000000072001502133446700166750ustar00rootroot00000000000000/**************************************************************************** * checkasm-arm.S: assembly check tool ***************************************************************************** * Copyright (C) 2015-2025 x264 project * * Authors: Martin Storsjo * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "../common/arm/asm.S" const register_init, align=4 .quad 0x21f86d66c8ca00ce .quad 0x75b6ba21077c48ad .quad 0xed56bb2dcb3c7736 .quad 0x8bda43d3fd1a7e06 .quad 0xb64a9c9e5d318408 .quad 0xdf9a54b303f1d3a3 .quad 0x4a75479abd64e097 .quad 0x249214109d5d1c88 endconst const error_message .asciz "failed to preserve register" endconst .text @ max number of args used by any x264 asm function. 
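@
@ Descriptive note on the code below: the clobbercheck macro is expanded
@ twice (neon and noneon) because q4-q7 are callee-saved VFP/NEON registers
@ that can only be saved and checked when NEON is available; the GPR checks
@ skip r9 on iOS, where the ABI treats it as volatile.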
#define MAX_ARGS 15 #define ARG_STACK 4*(MAX_ARGS - 4) @ align the used stack space to 8 to preserve the stack alignment #define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed) .macro clobbercheck variant .equ pushed, 4*10 function checkasm_call_\variant push {r4-r11, lr} .ifc \variant, neon vpush {q4-q7} .equ pushed, pushed + 16*4 .endif movrel r12, register_init .ifc \variant, neon vldm r12, {q4-q7} .endif ldm r12, {r4-r11} push {r1} sub sp, sp, #ARG_STACK_A .equ pos, 0 .rept MAX_ARGS-4 ldr r12, [sp, #ARG_STACK_A + pushed + 8 + pos] str r12, [sp, #pos] .equ pos, pos + 4 .endr mov r12, r0 mov r0, r2 mov r1, r3 ldrd r2, r3, [sp, #ARG_STACK_A + pushed] blx r12 add sp, sp, #ARG_STACK_A pop {r2} push {r0, r1} movrel r12, register_init .ifc \variant, neon vldm r12, {q0-q3} veor q0, q0, q4 veor q1, q1, q5 veor q2, q2, q6 veor q3, q3, q7 vorr q0, q0, q1 vorr q0, q0, q2 vorr q0, q0, q3 vorr d0, d0, d1 vrev64.32 d1, d0 vorr d0, d0, d1 vmov.32 r3, d0[0] .else mov r3, #0 .endif .macro check_reg reg1, reg2= ldrd r0, r1, [r12], #8 eor r0, r0, \reg1 orr r3, r3, r0 .ifnb \reg2 eor r1, r1, \reg2 orr r3, r3, r1 .endif .endm check_reg r4, r5 check_reg r6, r7 @ r9 is a volatile register in the ios ABI #if SYS_MACOSX check_reg r8 #else check_reg r8, r9 #endif check_reg r10, r11 .purgem check_reg cmp r3, #0 beq 0f mov r12, #0 str r12, [r2] movrel r0, error_message blx EXT(puts) 0: pop {r0, r1} .ifc \variant, neon vpop {q4-q7} .endif pop {r4-r11, pc} endfunc .endm clobbercheck neon clobbercheck noneon x264-master/tools/checkasm-loongarch.S000066400000000000000000000124001502133446700200700ustar00rootroot00000000000000/**************************************************************************** * checkasm-loongarch.S: assembly check tool ***************************************************************************** * Copyright (C) 2024-2025 x264 project * * Authors: Xiwei Gu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include "../common/loongarch/loongson_asm.S" const register_init, align=3 .quad 0x21f86d66c8ca00ce .quad 0x75b6ba21077c48ad .quad 0xed56bb2dcb3c7736 .quad 0x8bda43d3fd1a7e06 .quad 0xb64a9c9e5d318408 .quad 0xdf9a54b303f1d3a3 .quad 0x4a75479abd64e097 .quad 0x249214109d5d1c88 .quad 0x1a1b2550a612b48c .quad 0x79445c159ce79064 .quad 0x2eed899d5a28ddcd .quad 0x86b2536fcd8cf636 .quad 0xb0856806085e7943 .quad 0x3f2bf84fc0fcca4e .quad 0xacbd382dcf5b8de2 .quad 0xd229e1f5b281303f .quad 0x71aeaff20b095fd9 endconst const error_message .asciz "failed to preserve register" endconst .text // max number of args used by any x264 asm function. 
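//
// Descriptive note on the code below: x264_checkasm_call saves the
// callee-saved registers s0-s8 and fs0-fs7, reloads them from register_init
// before invoking the function under test, and XORs them with the same
// constants afterwards; a non-zero accumulator clears the "ok" flag and
// prints the shared error message, mirroring the x86 and AArch64 versions.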
#define MAX_ARGS 15 #define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15) // Fill dirty data at stack space function x264_checkasm_stack_clobber move t0, sp addi.d t1, zero, CLOBBER_STACK 1: st.d a0, sp, 0x00 st.d a1, sp, -0x08 addi.d sp, sp, -0x10 addi.d t1, t1, -0x10 blt zero,t1, 1b move sp, t0 endfunc #define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15) function x264_checkasm_call // Saved s0 - s8, fs0 - fs7 move t4, sp addi.d sp, sp, -136 st.d s0, sp, 0 st.d s1, sp, 8 st.d s2, sp, 16 st.d s3, sp, 24 st.d s4, sp, 32 st.d s5, sp, 40 st.d s6, sp, 48 st.d s7, sp, 56 st.d s8, sp, 64 fst.d fs0, sp, 72 fst.d fs1, sp, 80 fst.d fs2, sp, 88 fst.d fs3, sp, 96 fst.d fs4, sp, 104 fst.d fs5, sp, 112 fst.d fs6, sp, 120 fst.d fs7, sp, 128 la.local t1, register_init ld.d s0, t1, 0 ld.d s1, t1, 8 ld.d s2, t1, 16 ld.d s3, t1, 24 ld.d s4, t1, 32 ld.d s5, t1, 40 ld.d s6, t1, 48 ld.d s7, t1, 56 ld.d s8, t1, 64 fld.d fs0, t1, 72 fld.d fs1, t1, 80 fld.d fs2, t1, 88 fld.d fs3, t1, 96 fld.d fs4, t1, 104 fld.d fs5, t1, 112 fld.d fs6, t1, 120 fld.d fs7, t1, 128 addi.d sp, sp, -16 st.d a1, sp, 0 // ok st.d ra, sp, 8 // Ret address addi.d sp, sp, -ARG_STACK addi.d t0, zero, 8*8 xor t1, t1, t1 .rept MAX_ARGS - 8 // Skip the first 8 args, that are loaded into registers ldx.d t2, t4, t0 stx.d t2, sp, t1 addi.d t0, t0, 8 addi.d t1, t1, 8 .endr move t3, a0 // Func ld.d a0, t4, 0 ld.d a1, t4, 8 ld.d a2, t4, 16 ld.d a3, t4, 24 ld.d a4, t4, 32 ld.d a5, t4, 40 ld.d a6, t4, 48 ld.d a7, t4, 56 jirl ra, t3, 0 addi.d sp, sp, ARG_STACK ld.d t2, sp, 0 // ok ld.d ra, sp, 8 // Ret address addi.d sp, sp, 16 la.local t1, register_init xor t3, t3, t3 .macro check_reg_gr reg1 ld.d t0, t1, 0 xor t0, $s\reg1, t0 or t3, t3, t0 addi.d t1, t1, 8 .endm check_reg_gr 0 check_reg_gr 1 check_reg_gr 2 check_reg_gr 3 check_reg_gr 4 check_reg_gr 5 check_reg_gr 6 check_reg_gr 7 check_reg_gr 8 .macro check_reg_fr reg1 ld.d t0, t1, 0 movfr2gr.d t4,$fs\reg1 xor t0, t0, t4 or t3, t3, t0 addi.d t1, t1, 8 .endm check_reg_fr 0 check_reg_fr 1 check_reg_fr 2 check_reg_fr 3 check_reg_fr 4 check_reg_fr 5 check_reg_fr 6 check_reg_fr 7 beqz t3, 0f st.d zero,t2, 0x00 // Set OK to 0 la.local a0, error_message addi.d sp, sp, -8 st.d ra, sp, 0 bl puts ld.d ra, sp, 0 addi.d sp, sp, 8 0: ld.d s0, sp, 0 ld.d s1, sp, 8 ld.d s2, sp, 16 ld.d s3, sp, 24 ld.d s4, sp, 32 ld.d s5, sp, 40 ld.d s6, sp, 48 ld.d s7, sp, 56 ld.d s8, sp, 64 fld.d fs0, sp, 72 fld.d fs1, sp, 80 fld.d fs2, sp, 88 fld.d fs3, sp, 96 fld.d fs4, sp, 104 fld.d fs5, sp, 112 fld.d fs6, sp, 120 fld.d fs7, sp, 128 addi.d sp, sp, 136 endfunc x264-master/tools/checkasm.c000066400000000000000000003647611502133446700161620ustar00rootroot00000000000000/***************************************************************************** * checkasm.c: assembly check tool ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. * * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #include #include "common/common.h" #include "encoder/macroblock.h" #ifdef _WIN32 #include #endif // GCC doesn't align stack variables on ARM, so use .bss #if ARCH_ARM #undef ALIGNED_16 #define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 ) #endif /* buf1, buf2: initialised to random data and shouldn't write into them */ static uint8_t *buf1, *buf2; /* buf3, buf4: used to store output */ static uint8_t *buf3, *buf4; /* pbuf1, pbuf2: initialised to random pixel data and shouldn't write into them. */ static pixel *pbuf1, *pbuf2; /* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */ static pixel *pbuf3, *pbuf4; #if BIT_DEPTH > 8 #define FMT_PIXEL "%04x" #else #define FMT_PIXEL "%02x" #endif #define X264_ISDIGIT(x) isdigit((unsigned char)(x)) static int quiet = 0; #define report( name ) { \ if( used_asm && !quiet ) \ fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \ if( !ok ) ret = -1; \ } #define BENCH_RUNS 2000 // tradeoff between accuracy and speed #define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions #define MAX_CPUS 30 // number of different combinations of cpu flags // RAND_MAX is guaranteed to be at least 32767, to get 30 bits of random data, we'll call rand() twice #define rand30() (((rand() & 0x7fff) << 15) + (rand() & 0x7fff)) typedef struct { void *pointer; // just for detecting duplicates uint32_t cpu; uint64_t cycles; uint32_t den; } bench_t; typedef struct { char *name; bench_t vers[MAX_CPUS]; } bench_func_t; static int do_bench = 0; static int bench_pattern_len = 0; static const char *bench_pattern = ""; static char func_name[100]; static bench_func_t benchs[MAX_FUNCS]; static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" }; static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" }; static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" }; static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" }; static const char **intra_predict_8x8_names = intra_predict_4x4_names; static const char **intra_predict_8x16c_names = intra_predict_8x8c_names; #define set_func_name(...) 
snprintf( func_name, sizeof(func_name), __VA_ARGS__ ) static inline uint32_t read_time(void) { uint32_t a = 0; #if HAVE_X86_INLINE_ASM asm volatile( "lfence \n" "rdtsc \n" : "=a"(a) :: "edx", "memory" ); #elif ARCH_PPC asm volatile( "mftb %0" : "=r"(a) :: "memory" ); #elif HAVE_ARM_INLINE_ASM // ARMv7 only asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" ); #elif ARCH_AARCH64 uint64_t b = 0; asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" ); a = b; #elif ARCH_MIPS asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" ); #elif ARCH_LOONGARCH uint32_t id = 0; asm volatile( "rdtimel.w %0, %1" : "=r"(a), "=r"(id) :: "memory" ); #endif return a; } static bench_t* get_bench( const char *name, uint32_t cpu ) { int i, j; for( i = 0; benchs[i].name && strcmp(name, benchs[i].name); i++ ) assert( i < MAX_FUNCS ); if( !benchs[i].name ) benchs[i].name = strdup( name ); if( !cpu ) return &benchs[i].vers[0]; for( j = 1; benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ ) assert( j < MAX_CPUS ); benchs[i].vers[j].cpu = cpu; return &benchs[i].vers[j]; } static int cmp_nop( const void *a, const void *b ) { return *(uint16_t*)a - *(uint16_t*)b; } static int cmp_bench( const void *a, const void *b ) { // asciibetical sort except preserving numbers const char *sa = ((bench_func_t*)a)->name; const char *sb = ((bench_func_t*)b)->name; for( ;; sa++, sb++ ) { if( !*sa && !*sb ) return 0; if( X264_ISDIGIT( *sa ) && X264_ISDIGIT( *sb ) && X264_ISDIGIT( sa[1] ) != X264_ISDIGIT( sb[1] ) ) return X264_ISDIGIT( sa[1] ) - X264_ISDIGIT( sb[1] ); if( *sa != *sb ) return *sa - *sb; } } static void print_bench(void) { uint16_t nops[10000]; int nfuncs, nop_time=0; for( int i = 0; i < 10000; i++ ) { uint32_t t = read_time(); nops[i] = read_time() - t; } qsort( nops, 10000, sizeof(uint16_t), cmp_nop ); for( int i = 500; i < 9500; i++ ) nop_time += nops[i]; nop_time /= 900; printf( "nop: %d\n", nop_time ); for( nfuncs = 0; nfuncs < MAX_FUNCS && benchs[nfuncs].name; nfuncs++ ); qsort( benchs, nfuncs, sizeof(bench_func_t), cmp_bench ); for( int i = 0; i < nfuncs; i++ ) for( int j = 0; j < MAX_CPUS && (!j || benchs[i].vers[j].cpu); j++ ) { int k; bench_t *b = &benchs[i].vers[j]; if( !b->den ) continue; for( k = 0; k < j && benchs[i].vers[k].pointer != b->pointer; k++ ); if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if ARCH_X86 || ARCH_X86_64 b->cpu&X264_CPU_AVX512 ? "avx512" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_BMI2 ? "bmi2" : b->cpu&X264_CPU_BMI1 ? "bmi1" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : b->cpu&X264_CPU_AVX ? "avx" : b->cpu&X264_CPU_SSE42 ? "sse42" : b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : b->cpu&X264_CPU_LZCNT ? "lzcnt" : /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && jcpu&X264_CPU_SSE2 ? "sse2" : b->cpu&X264_CPU_SSE ? "sse" : b->cpu&X264_CPU_MMX ? "mmx" : #elif ARCH_PPC b->cpu&X264_CPU_ALTIVEC ? "altivec" : #elif ARCH_ARM b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV6 ? "armv6" : #elif ARCH_AARCH64 b->cpu&X264_CPU_SVE2 ? "sve2" : b->cpu&X264_CPU_SVE ? "sve" : b->cpu&X264_CPU_I8MM ? "i8mm" : b->cpu&X264_CPU_DOTPROD ? "dotprod" : b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV8 ? "armv8" : #elif ARCH_MIPS b->cpu&X264_CPU_MSA ? "msa" : #elif ARCH_LOONGARCH b->cpu&X264_CPU_LASX ? "lasx" : b->cpu&X264_CPU_LSX ? 
"lsx" : #endif "c", #if ARCH_X86 || ARCH_X86_64 b->cpu&X264_CPU_CACHELINE_32 ? "_c32" : b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" : b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : #elif ARCH_ARM b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : #endif "", (int64_t)(10*b->cycles/b->den - nop_time)/4 ); } } /* YMM and ZMM registers on x86 are turned off to save power when they haven't been * used for some period of time. When they are used there will be a "warmup" period * during which performance will be reduced and inconsistent which is problematic when * trying to benchmark individual functions. We can work around this by periodically * issuing "dummy" instructions that uses those registers to keep them powered on. */ static void (*simd_warmup_func)( void ) = NULL; #define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 ) #if HAVE_MMX int x264_stack_pagealign( int (*func)(), int align ); void x264_checkasm_warmup_avx( void ); void x264_checkasm_warmup_avx512( void ); /* detect when callee-saved regs aren't saved * needs an explicit asm check because it only sometimes crashes in normal use. */ intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); #else #define x264_stack_pagealign( func, align ) func() #endif #if HAVE_AARCH64 intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); #if HAVE_SVE int x264_checkasm_sve_length( void ); #endif #endif #if HAVE_ARMV6 intptr_t x264_checkasm_call_neon( intptr_t (*func)(), int *ok, ... ); intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... ); intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon; #endif #if ARCH_LOONGARCH intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); #endif #define call_c1(func,...) func(__VA_ARGS__) #if HAVE_MMX && ARCH_X86_64 /* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit. * This is done by clobbering the stack with junk around the stack pointer and calling the * assembly function through x264_checkasm_call with added dummy arguments which forces all * real arguments to be passed on the stack and not in registers. For 32-bit argument the * upper half of the 64-bit register location on the stack will now contain junk. Note that * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may * overwrite the junk written to the stack so there's no guarantee that it will always * detect all functions that assumes zero-extension. */ void x264_checkasm_stack_clobber( uint64_t clobber, ... ); #define call_a1(func,...) ({ \ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \ simd_warmup(); \ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); }) #elif HAVE_AARCH64 && !defined(__APPLE__) void x264_checkasm_stack_clobber( uint64_t clobber, ... ); #define call_a1(func,...) ({ \ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); }) #elif HAVE_MMX || HAVE_ARMV6 #define call_a1(func,...) 
x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ ) #elif ARCH_LOONGARCH && HAVE_LSX void x264_checkasm_stack_clobber( uint64_t clobber, ... ); #define call_a1(func,...) ({ \ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); }) #else #define call_a1 call_c1 #endif #if HAVE_ARMV6 #define call_a1_64(func,...) ((uint64_t (*)(intptr_t(*)(), int*, ...))x264_checkasm_call)( (intptr_t(*)())func, &ok, __VA_ARGS__ ) #else #define call_a1_64 call_a1 #endif #define call_bench(func,cpu,...)\ if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\ {\ uint64_t tsum = 0;\ int tcount = 0;\ call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ {\ simd_warmup();\ uint32_t t = read_time();\ func(__VA_ARGS__);\ func(__VA_ARGS__);\ func(__VA_ARGS__);\ func(__VA_ARGS__);\ t = read_time() - t;\ if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\ {\ tsum += t;\ tcount++;\ }\ }\ bench_t *b = get_bench( func_name, cpu );\ b->cycles += tsum;\ b->den += tcount;\ b->pointer = func;\ } /* for most functions, run benchmark and correctness test at the same time. * for those that modify their inputs, run the above macros separately */ #define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); }) #define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); }) #define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); }) #define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); }) #define call_a64(func,...) ({ call_a2(func,__VA_ARGS__); call_a1_64(func,__VA_ARGS__); }) static int check_pixel( uint32_t cpu_ref, uint32_t cpu_new ) { x264_pixel_function_t pixel_c; x264_pixel_function_t pixel_ref; x264_pixel_function_t pixel_asm; x264_predict_t predict_4x4[12]; x264_predict8x8_t predict_8x8[12]; x264_predict_8x8_filter_t predict_8x8_filter; ALIGNED_16( pixel edge[36] ); uint16_t cost_mv[32]; int ret = 0, ok, used_asm; x264_pixel_init( 0, &pixel_c ); x264_pixel_init( cpu_ref, &pixel_ref ); x264_pixel_init( cpu_new, &pixel_asm ); x264_predict_4x4_init( 0, predict_4x4 ); x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter ); predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); // maximize sum for( int i = 0; i < 256; i++ ) { int z = i|(i>>4); z ^= z>>2; z ^= z>>1; pbuf4[i] = -(z&1) & PIXEL_MAX; pbuf3[i] = ~pbuf4[i] & PIXEL_MAX; } // random pattern made of maxed pixel differences, in case an intermediate value overflows for( int i = 256; i < 0x1000; i++ ) { pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX; pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX; } #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ { \ set_func_name( "%s_%s", #name, pixel_names[i] ); \ used_asm = 1; \ for( int j = 0; j < 64; j++ ) \ { \ intptr_t stride1 = (j&31) == 31 ? 
32 : FENC_STRIDE; \ res_c = call_c( pixel_c.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \ res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ break; \ } \ } \ for( int j = 0; j < 0x1000 && ok; j += 256 ) \ { \ res_c = pixel_c .name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \ res_asm = pixel_asm.name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: overflow %d != %d\n", i, res_c, res_asm ); \ } \ } \ } \ } \ report( "pixel " #name " :" ); TEST_PIXEL( sad, 0 ); TEST_PIXEL( sad_aligned, 1 ); TEST_PIXEL( ssd, 1 ); TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 1 ); ok = 1, used_asm = 0; if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] ) { set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] ); used_asm = 1; for( int j = 0; j < 64; j++ ) { uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 ); uint64_t res_a = call_a64( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ); uint32_t cost8_a = res_a; uint32_t cost4_a = res_a >> 32; if( cost8_a != cost8_c || cost4_a != cost4_c ) { ok = 0; fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, cost8_c, cost4_c, cost8_a, cost4_a ); break; } } for( int j = 0; j < 0x1000 && ok; j += 256 ) \ { uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 ); uint32_t cost8_a = res_a; uint32_t cost4_a = res_a >> 32; if( cost8_a != cost8_c || cost4_a != cost4_c ) { ok = 0; fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16, cost8_c, cost4_c, cost8_a, cost4_a ); } } } report( "pixel sa8d_satd :" ); #define TEST_PIXEL_X( N ) \ ok = 1; used_asm = 0; \ for( int i = 0; i < 7; i++ ) \ { \ ALIGNED_16( int res_c[4] ) = {0}; \ ALIGNED_16( int res_asm[4] ) = {0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \ used_asm = 1; \ for( int j = 0; j < 64; j++ ) \ { \ pixel *pix2 = pbuf2+j; \ res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \ res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \ res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \ if( N == 4 ) \ { \ res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \ call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \ } \ else \ call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \ if( memcmp(res_c, res_asm, N*sizeof(int)) ) \ { \ ok = 0; \ fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \ i, res_c[0], res_c[1], res_c[2], res_c[3], \ res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \ } \ if( N == 4 ) \ call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \ else \ call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \ } \ } \ } \ report( "pixel sad_x"#N" :" ); TEST_PIXEL_X(3); TEST_PIXEL_X(4); #define TEST_PIXEL_VAR( i ) \ if( pixel_asm.var[i] != pixel_ref.var[i] ) \ { \ set_func_name( "%s_%s", "var", pixel_names[i] ); \ used_asm = 1; \ /* abi-check wrapper can't return uint64_t, so separate it from return value check 
*/ \ call_c1( pixel_c.var[i], pbuf1, 16 ); \ call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \ uint64_t res_c = pixel_c.var[i]( pbuf1, 16 ); \ uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \ } \ call_c2( pixel_c.var[i], pbuf1, (intptr_t)16 ); \ call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \ } ok = 1; used_asm = 0; TEST_PIXEL_VAR( PIXEL_16x16 ); TEST_PIXEL_VAR( PIXEL_8x16 ); TEST_PIXEL_VAR( PIXEL_8x8 ); report( "pixel var :" ); #define TEST_PIXEL_VAR2( i ) \ if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \ { \ int res_c, res_asm; \ ALIGNED_ARRAY_8( int, ssd_c, [2] ); \ ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \ set_func_name( "%s_%s", "var2", pixel_names[i] ); \ used_asm = 1; \ res_c = call_c( pixel_c.var2[i], pbuf1, pbuf2, ssd_c ); \ res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \ if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \ { \ ok = 0; \ fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \ } \ } ok = 1; used_asm = 0; TEST_PIXEL_VAR2( PIXEL_8x16 ); TEST_PIXEL_VAR2( PIXEL_8x8 ); report( "pixel var2 :" ); ok = 1; used_asm = 0; for( int i = 0; i < 4; i++ ) if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] ) { set_func_name( "hadamard_ac_%s", pixel_names[i] ); used_asm = 1; for( int j = 0; j < 32; j++ ) { pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256; call_c1( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 ); call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 ); uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 ); uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 ); if( rc != ra ) { ok = 0; fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) ); break; } } call_c2( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 ); call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 ); } report( "pixel hadamard_ac :" ); // maximize sum for( int i = 0; i < 32; i++ ) for( int j = 0; j < 16; j++ ) pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX; ok = 1; used_asm = 0; if( pixel_asm.vsad != pixel_ref.vsad ) { for( int h = 2; h <= 32; h += 2 ) { int res_c, res_asm; set_func_name( "vsad" ); used_asm = 1; for( int j = 0; j < 2 && ok; j++ ) { pixel *p = j ? pbuf4 : pbuf1; res_c = call_c( pixel_c.vsad, p, (intptr_t)16, h ); res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h ); if( res_c != res_asm ) { ok = 0; fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm ); break; } } } } report( "pixel vsad :" ); ok = 1; used_asm = 0; if( pixel_asm.asd8 != pixel_ref.asd8 ) { set_func_name( "asd8" ); used_asm = 1; int res_c = call_c( pixel_c.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 ); int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 ); if( res_c != res_a ) { ok = 0; fprintf( stderr, "asd: %d != %d\n", res_c, res_a ); } } report( "pixel asd :" ); #define TEST_INTRA_X3( name, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ ALIGNED_16( int res_c[4] ); \ ALIGNED_16( int res_asm[4] ); \ set_func_name( #name ); \ used_asm = 1; \ call_c( pixel_c.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_c ); \ call_a( pixel_asm.name, pbuf1+48, i8x8 ? 
edge : pbuf3+48, res_asm ); \ if( memcmp(res_c, res_asm, 3 * sizeof(*res_c)) ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \ res_c[0], res_c[1], res_c[2], \ res_asm[0], res_asm[1], res_asm[2] ); \ } \ } #define TEST_INTRA_X9( name, cmp ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ set_func_name( #name ); \ used_asm = 1; \ ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \ for( int i=0; i<17; i++ ) \ bitcosts[i] = 9*(i!=8); \ memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*SIZEOF_PIXEL ); \ memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*SIZEOF_PIXEL ); \ for( int i=0; i<32; i++ ) \ { \ pixel *fenc = pbuf1+48+i*12; \ pixel *fdec1 = pbuf3+48+i*12; \ pixel *fdec2 = pbuf4+48+i*12; \ int pred_mode = i%9; \ int res_c = INT_MAX; \ for( int j=0; j<9; j++ ) \ { \ predict_4x4[j]( fdec1 ); \ int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \ if( cost < (uint16_t)res_c ) \ res_c = cost + (j<<16); \ } \ predict_4x4[res_c>>16]( fdec1 ); \ int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \ if( res_c != res_a ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \ break; \ } \ if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*SIZEOF_PIXEL) ) \ { \ ok = 0; \ fprintf( stderr, #name" [FAILED]\n" ); \ for( int j=0; j<16; j++ ) \ fprintf( stderr, FMT_PIXEL" ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \ fprintf( stderr, "\n" ); \ for( int j=0; j<16; j++ ) \ fprintf( stderr, FMT_PIXEL" ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \ fprintf( stderr, "\n" ); \ break; \ } \ } \ } #define TEST_INTRA8_X9( name, cmp ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \ set_func_name( #name ); \ used_asm = 1; \ ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \ ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \ ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \ memset( satds_c, 0, 16 * sizeof(*satds_c) ); \ memset( satds_a, 0, 16 * sizeof(*satds_a) ); \ for( int i=0; i<17; i++ ) \ bitcosts[i] = 9*(i!=8); \ for( int i=0; i<32; i++ ) \ { \ pixel *fenc = pbuf1+48+i*12; \ pixel *fdec1 = pbuf3+48+i*12; \ pixel *fdec2 = pbuf4+48+i*12; \ int pred_mode = i%9; \ int res_c = INT_MAX; \ predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \ for( int j=0; j<9; j++ ) \ { \ predict_8x8[j]( fdec1, edge ); \ satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \ if( satds_c[j] < (uint16_t)res_c ) \ res_c = satds_c[j] + (j<<16); \ } \ predict_8x8[res_c>>16]( fdec1, edge ); \ int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \ if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \ for( int j = 0; j < 9; j++ ) \ fprintf( stderr, "%5d ", satds_c[j]); \ fprintf( stderr, "\n" ); \ for( int j = 0; j < 9; j++ ) \ fprintf( stderr, "%5d ", satds_a[j]); \ fprintf( stderr, "\n" ); \ break; \ } \ for( int j=0; j<8; j++ ) \ if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*SIZEOF_PIXEL) ) \ ok = 0; \ if( !ok ) \ { \ fprintf( stderr, #name" [FAILED]\n" ); \ for( int j=0; j<8; j++ ) \ { \ for( int k=0; k<8; k++ ) \ fprintf( stderr, FMT_PIXEL" ", fdec1[k+j*FDEC_STRIDE] ); \ fprintf( stderr, "\n" ); \ } \ fprintf( stderr, "\n" ); \ for( int j=0; j<8; j++ ) \ { \ for( int k=0; k<8; k++ ) \ fprintf( stderr, FMT_PIXEL" ", fdec2[k+j*FDEC_STRIDE] ); \ fprintf( stderr, "\n" ); \ } \ 
fprintf( stderr, "\n" ); \ break; \ } \ } \ } memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*SIZEOF_PIXEL ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_satd_x3_16x16, 0 ); TEST_INTRA_X3( intra_satd_x3_8x16c, 0 ); TEST_INTRA_X3( intra_satd_x3_8x8c, 0 ); TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge ); TEST_INTRA_X3( intra_satd_x3_4x4, 0 ); report( "intra satd_x3 :" ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_sad_x3_16x16, 0 ); TEST_INTRA_X3( intra_sad_x3_8x16c, 0 ); TEST_INTRA_X3( intra_sad_x3_8x8c, 0 ); TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge ); TEST_INTRA_X3( intra_sad_x3_4x4, 0 ); report( "intra sad_x3 :" ); ok = 1; used_asm = 0; TEST_INTRA_X9( intra_satd_x9_4x4, satd ); TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d ); report( "intra satd_x9 :" ); ok = 1; used_asm = 0; TEST_INTRA_X9( intra_sad_x9_4x4, sad ); TEST_INTRA8_X9( intra_sad_x9_8x8, sad ); report( "intra sad_x9 :" ); ok = 1; used_asm = 0; if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core ) { used_asm = 1; set_func_name( "ssd_nv12" ); uint64_t res_u_c, res_v_c, res_u_a, res_v_a; for( int w = 8; w <= 360; w += 8 ) { pixel_c.ssd_nv12_core( pbuf1, 368, pbuf2, 368, w, 8, &res_u_c, &res_v_c ); pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, w, 8, &res_u_a, &res_v_a ); if( res_u_c != res_u_a || res_v_c != res_v_a ) { ok = 0; fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n", res_u_c, res_v_c, res_u_a, res_v_a ); } } call_c( pixel_c.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c ); call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a ); } report( "ssd_nv12 :" ); if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core || pixel_asm.ssim_end4 != pixel_ref.ssim_end4 ) { int cnt; float res_c, res_a; ALIGNED_16( int sums[5][4] ) = {{0}}; used_asm = ok = 1; x264_emms(); res_c = x264_pixel_ssim_wxh( &pixel_c, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt ); res_a = x264_pixel_ssim_wxh( &pixel_asm, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt ); if( fabs( res_c - res_a ) > 1e-5 ) { ok = 0; fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a ); } set_func_name( "ssim_core" ); call_c( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums ); call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums ); set_func_name( "ssim_end" ); call_c2( pixel_c.ssim_end4, sums, sums, 4 ); call_a2( pixel_asm.ssim_end4, sums, sums, 4 ); /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */ call_c1( pixel_c.ssim_end4, sums, sums, 3 ); call_a1( pixel_asm.ssim_end4, sums, sums, 3 ); report( "ssim :" ); } ok = 1; used_asm = 0; for( int i = 0; i < 32; i++ ) cost_mv[i] = rand30() & 0xffff; for( int i = 0; i < 100 && ok; i++ ) if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] ) { ALIGNED_16( uint16_t sums[72] ); ALIGNED_16( int dc[4] ); ALIGNED_16( int16_t mvs_a[48] ); ALIGNED_16( int16_t mvs_c[48] ); int mvn_a, mvn_c; int thresh = (rand() % 257) * PIXEL_MAX + (rand30() & 0xffff); set_func_name( "esa_ads_%s", pixel_names[i&3] ); if( i < 40 ) { for( int j = 0; j < 72; j++ ) sums[j] = (rand() % 9) * 8 * PIXEL_MAX; for( int j = 0; j < 4; j++ ) dc[j] = (rand() % 9) * 8 * PIXEL_MAX; } else { #if BIT_DEPTH + 6 > 15 for( int j = 0; j < 72; j++ ) sums[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1); for( int j = 0; j < 4; j++ ) dc[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1); #else for( int j = 0; j < 72; j++ ) sums[j] = rand() & ((1 << (BIT_DEPTH + 6))-1); for( int j = 0; j < 4; j++ ) dc[j] = rand() & 
((1 << (BIT_DEPTH + 6))-1); #endif } used_asm = 1; mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh ); mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh ); if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) ) { ok = 0; fprintf( stderr, "thresh: %d\n", thresh ); fprintf( stderr, "c%d: ", i&3 ); for( int j = 0; j < mvn_c; j++ ) fprintf( stderr, "%d ", mvs_c[j] ); fprintf( stderr, "\na%d: ", i&3 ); for( int j = 0; j < mvn_a; j++ ) fprintf( stderr, "%d ", mvs_a[j] ); fprintf( stderr, "\n\n" ); } } report( "esa ads:" ); return ret; } static int check_dct( uint32_t cpu_ref, uint32_t cpu_new ) { x264_dct_function_t dct_c; x264_dct_function_t dct_ref; x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] ); ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] ); ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] ); ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; x264_dct_init( 0, &dct_c ); x264_dct_init( cpu_ref, &dct_ref); x264_dct_init( cpu_new, &dct_asm ); memset( h, 0, sizeof(*h) ); x264_param_default( &h->param ); h->sps->i_chroma_format_idc = 1; h->chroma_qp_table = i_chroma_qp_table + 12; h->param.analyse.i_luma_deadzone[0] = 0; h->param.analyse.i_luma_deadzone[1] = 0; h->param.analyse.b_transform_8x8 = 1; for( int i = 0; i < 8; i++ ) h->sps->scaling_list[i] = x264_cqm_flat16; x264_cqm_init( h ); x264_quant_init( h, 0, &qf ); /* overflow test cases */ for( int i = 0; i < 5; i++ ) { pixel *enc = &pbuf3[16*i*FENC_STRIDE]; pixel *dec = &pbuf4[16*i*FDEC_STRIDE]; for( int j = 0; j < 16; j++ ) { int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1)); int cond_b = (i == 0) ? 1 : !cond_a; enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0; enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? 
PIXEL_MAX : 0; for( int k = 0; k < 4; k++ ) dec[k] = PIXEL_MAX - enc[k]; enc += FENC_STRIDE; dec += FDEC_STRIDE; } } #define TEST_DCT( name, t1, t2, size ) \ if( dct_asm.name != dct_ref.name ) \ { \ set_func_name( #name ); \ used_asm = 1; \ pixel *enc = pbuf3; \ pixel *dec = pbuf4; \ for( int j = 0; j < 5; j++) \ { \ call_c( dct_c.name, t1, &pbuf1[j*64], &pbuf2[j*64] ); \ call_a( dct_asm.name, t2, &pbuf1[j*64], &pbuf2[j*64] ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ for( int k = 0; k < size; k++ )\ fprintf( stderr, "%d ", ((dctcoef*)t1)[k] );\ fprintf( stderr, "\n" );\ for( int k = 0; k < size; k++ )\ fprintf( stderr, "%d ", ((dctcoef*)t2)[k] );\ fprintf( stderr, "\n" );\ break; \ } \ call_c( dct_c.name, t1, enc, dec ); \ call_a( dct_asm.name, t2, enc, dec ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED] (overflow)\n" ); \ break; \ } \ enc += 16*FENC_STRIDE; \ dec += 16*FDEC_STRIDE; \ } \ } ok = 1; used_asm = 0; TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 ); TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 ); TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 ); TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 ); TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 ); report( "sub_dct4 :" ); ok = 1; used_asm = 0; TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 ); TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 ); report( "sub_dct8 :" ); #undef TEST_DCT // fdct and idct are denormalized by different factors, so quant/dequant // is needed to force the coefs into the right range. dct_c.sub16x16_dct( dct4, pbuf1, pbuf2 ); dct_c.sub16x16_dct8( dct8, pbuf1, pbuf2 ); for( int i = 0; i < 16; i++ ) { qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] ); qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 ); } for( int i = 0; i < 4; i++ ) { qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] ); qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 ); } x264_cqm_delete( h ); #define TEST_IDCT( name, src ) \ if( dct_asm.name != dct_ref.name ) \ { \ set_func_name( #name ); \ used_asm = 1; \ memcpy( pbuf3, pbuf1, 32*32 * SIZEOF_PIXEL ); \ memcpy( pbuf4, pbuf1, 32*32 * SIZEOF_PIXEL ); \ memcpy( dct1, src, 256 * sizeof(dctcoef) ); \ memcpy( dct2, src, 256 * sizeof(dctcoef) ); \ call_c1( dct_c.name, pbuf3, (void*)dct1 ); \ call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \ if( memcmp( pbuf3, pbuf4, 32*32 * SIZEOF_PIXEL ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ call_c2( dct_c.name, pbuf3, (void*)dct1 ); \ call_a2( dct_asm.name, pbuf4, (void*)dct2 ); \ } ok = 1; used_asm = 0; TEST_IDCT( add4x4_idct, dct4 ); TEST_IDCT( add8x8_idct, dct4 ); TEST_IDCT( add8x8_idct_dc, dct4 ); TEST_IDCT( add16x16_idct, dct4 ); TEST_IDCT( add16x16_idct_dc, dct4 ); report( "add_idct4 :" ); ok = 1; used_asm = 0; TEST_IDCT( add8x8_idct8, dct8 ); TEST_IDCT( add16x16_idct8, dct8 ); report( "add_idct8 :" ); #undef TEST_IDCT #define TEST_DCTDC( name )\ ok = 1; used_asm = 0;\ if( dct_asm.name != dct_ref.name )\ {\ set_func_name( #name );\ used_asm = 1;\ uint16_t *p = (uint16_t*)buf1;\ for( int i = 0; i < 16 && ok; i++ )\ {\ for( int j = 0; j < 16; j++ )\ dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\ : i<8 ? (*p++)&1 ? 
PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\ : ((*p++)&0x1fff)-0x1000; /* general case */\ memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\ call_c1( dct_c.name, dct1[0] );\ call_a1( dct_asm.name, dct2[0] );\ if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\ ok = 0;\ }\ call_c2( dct_c.name, dct1[0] );\ call_a2( dct_asm.name, dct2[0] );\ }\ report( #name " :" ); TEST_DCTDC( dct4x4dc ); TEST_DCTDC( idct4x4dc ); #undef TEST_DCTDC #define TEST_DCTDC_CHROMA( name )\ ok = 1; used_asm = 0;\ if( dct_asm.name != dct_ref.name )\ {\ set_func_name( #name );\ used_asm = 1;\ uint16_t *p = (uint16_t*)buf1;\ for( int i = 0; i < 16 && ok; i++ )\ {\ for( int j = 0; j < 8; j++ )\ dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\ : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\ : ((*p++)&0x1fff)-0x1000; /* general case */\ memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\ call_c1( dct_c.name, dctdc[0], dct1 );\ call_a1( dct_asm.name, dctdc[1], dct2 );\ if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\ {\ ok = 0;\ fprintf( stderr, #name " [FAILED]\n" ); \ }\ }\ call_c2( dct_c.name, dctdc[0], dct1 );\ call_a2( dct_asm.name, dctdc[1], dct2 );\ }\ report( #name " :" ); TEST_DCTDC_CHROMA( dct2x4dc ); #undef TEST_DCTDC_CHROMA x264_zigzag_function_t zigzag_c[2]; x264_zigzag_function_t zigzag_ref[2]; x264_zigzag_function_t zigzag_asm[2]; ALIGNED_ARRAY_64( dctcoef, level1,[64] ); ALIGNED_ARRAY_64( dctcoef, level2,[64] ); #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ for( int i = 0; i < size*size; i++ ) \ dct[i] = i; \ call_c( zigzag_c[interlace].name, t1, dct ); \ call_a( zigzag_asm[interlace].name, t2, dct ); \ if( memcmp( t1, t2, size*size*sizeof(dctcoef) ) ) \ { \ ok = 0; \ for( int i = 0; i < 2; i++ ) \ { \ dctcoef *d = (dctcoef*)(i ? 
t2 : t1); \ for( int j = 0; j < size; j++ ) \ { \ for( int k = 0; k < size; k++ ) \ fprintf( stderr, "%2d ", d[k+j*8] ); \ fprintf( stderr, "\n" ); \ } \ fprintf( stderr, "\n" ); \ } \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ } #define TEST_ZIGZAG_SUB( name, t1, t2, size ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ int nz_a, nz_c; \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * SIZEOF_PIXEL ); \ memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * SIZEOF_PIXEL ); \ nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \ nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*SIZEOF_PIXEL ) || nz_c != nz_a ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \ call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \ } #define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ int nz_a, nz_c; \ dctcoef dc_a, dc_c; \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ for( int i = 0; i < 2; i++ ) \ { \ memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * SIZEOF_PIXEL ); \ memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * SIZEOF_PIXEL ); \ for( int j = 0; j < 4; j++ ) \ { \ memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * SIZEOF_PIXEL ); \ memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * SIZEOF_PIXEL ); \ } \ nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \ nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \ if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * SIZEOF_PIXEL ) || nz_c != nz_a || dc_c != dc_a ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ break; \ } \ } \ call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \ call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \ } #define TEST_INTERLEAVE( name, t1, t2, dct, size ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ { \ for( int j = 0; j < 100; j++ ) \ { \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ memcpy(dct, buf1, size*sizeof(dctcoef)); \ for( int i = 0; i < size; i++ ) \ dct[i] = rand()&0x1F ? 0 : dct[i]; \ memcpy(buf3, buf4, 10); \ call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \ call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \ if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \ { \ ok = 0; \ fprintf( stderr, "%d: %d %d %d %d\n%d %d %d %d\n\n", memcmp( t1, t2, size*sizeof(dctcoef) ), buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9] ); \ break; \ } \ } \ } x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] ); x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] ); x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] ); ok = 1; used_asm = 0; TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 ); report( "zigzag_interleave :" ); for( interlace = 0; interlace <= 1; interlace++ ) { ok = 1; used_asm = 0; TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 ); TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 ); TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 ); TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 ); report( interlace ? 
"zigzag_field :" : "zigzag_frame :" ); } #undef TEST_ZIGZAG_SCAN #undef TEST_ZIGZAG_SUB return ret; } static int check_mc( uint32_t cpu_ref, uint32_t cpu_new ) { x264_mc_functions_t mc_c; x264_mc_functions_t mc_ref; x264_mc_functions_t mc_a; x264_pixel_function_t pixf; pixel *src = &(pbuf1)[2*64+2]; pixel *src2[4] = { &(pbuf1)[3*64+2], &(pbuf1)[5*64+2], &(pbuf1)[7*64+2], &(pbuf1)[9*64+2] }; pixel *dst1 = pbuf3; pixel *dst2 = pbuf4; int ret = 0, ok, used_asm; x264_mc_init( 0, &mc_c, 0 ); x264_mc_init( cpu_ref, &mc_ref, 0 ); x264_mc_init( cpu_new, &mc_a, 0 ); x264_pixel_init( 0, &pixf ); #define MC_TEST_LUMA( w, h ) \ if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \ { \ const x264_weight_t *weight = x264_weight_none; \ set_func_name( "mc_luma_%dx%d", w, h ); \ used_asm = 1; \ for( int i = 0; i < 1024; i++ ) \ pbuf3[i] = pbuf4[i] = 0xCD; \ call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \ call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \ if( memcmp( pbuf3, pbuf4, 1024 * SIZEOF_PIXEL ) ) \ { \ fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ ok = 0; \ } \ } \ if( mc_a.get_ref != mc_ref.get_ref ) \ { \ pixel *ref = dst2; \ intptr_t ref_stride = 32; \ int w_checked = ( ( SIZEOF_PIXEL == 2 && (w == 12 || w == 20)) ? w-2 : w ); \ const x264_weight_t *weight = x264_weight_none; \ set_func_name( "get_ref_%dx%d", w_checked, h ); \ used_asm = 1; \ for( int i = 0; i < 1024; i++ ) \ pbuf3[i] = pbuf4[i] = 0xCD; \ call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \ ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \ for( int i = 0; i < h; i++ ) \ if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * SIZEOF_PIXEL ) ) \ { \ fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w_checked, h ); \ ok = 0; \ break; \ } \ } #define MC_TEST_CHROMA( w, h ) \ if( mc_a.mc_chroma != mc_ref.mc_chroma ) \ { \ set_func_name( "mc_chroma_%dx%d", w, h ); \ used_asm = 1; \ for( int i = 0; i < 1024; i++ ) \ pbuf3[i] = pbuf4[i] = 0xCD; \ call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \ call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. 
*/ \ for( int j = 0; j < h; j++ ) \ for( int i = w; i < 8; i++ ) \ { \ dst2[i+j*16+8] = dst1[i+j*16+8]; \ dst2[i+j*16 ] = dst1[i+j*16 ]; \ } \ if( memcmp( pbuf3, pbuf4, 1024 * SIZEOF_PIXEL ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ ok = 0; \ } \ } ok = 1; used_asm = 0; for( int dy = -8; dy < 8; dy++ ) for( int dx = -128; dx < 128; dx++ ) { if( rand()&15 ) continue; // running all of them is too slow MC_TEST_LUMA( 20, 18 ); MC_TEST_LUMA( 16, 16 ); MC_TEST_LUMA( 16, 8 ); MC_TEST_LUMA( 12, 10 ); MC_TEST_LUMA( 8, 16 ); MC_TEST_LUMA( 8, 8 ); MC_TEST_LUMA( 8, 4 ); MC_TEST_LUMA( 4, 8 ); MC_TEST_LUMA( 4, 4 ); } report( "mc luma :" ); ok = 1; used_asm = 0; for( int dy = -1; dy < 9; dy++ ) for( int dx = -128; dx < 128; dx++ ) { if( rand()&15 ) continue; MC_TEST_CHROMA( 8, 8 ); MC_TEST_CHROMA( 8, 4 ); MC_TEST_CHROMA( 4, 8 ); MC_TEST_CHROMA( 4, 4 ); MC_TEST_CHROMA( 4, 2 ); MC_TEST_CHROMA( 2, 4 ); MC_TEST_CHROMA( 2, 2 ); } report( "mc chroma :" ); #undef MC_TEST_LUMA #undef MC_TEST_CHROMA #define MC_TEST_AVG( name, weight ) \ { \ for( int i = 0; i < 12; i++ ) \ { \ memcpy( pbuf3, pbuf1+320, 320 * SIZEOF_PIXEL ); \ memcpy( pbuf4, pbuf1+320, 320 * SIZEOF_PIXEL ); \ if( mc_a.name[i] != mc_ref.name[i] ) \ { \ set_func_name( "%s_%s", #name, pixel_names[i] ); \ used_asm = 1; \ call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \ call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \ if( memcmp( pbuf3, pbuf4, 320 * SIZEOF_PIXEL ) ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \ } \ call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \ call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \ } \ } \ } ok = 1, used_asm = 0; for( int w = -63; w <= 127 && ok; w++ ) MC_TEST_AVG( avg, w ); report( "mc wpredb :" ); #define MC_TEST_WEIGHT( name, weight, aligned ) \ int align_off = (aligned ? 
0 : rand()%16); \ for( int i = 1; i <= 5; i++ ) \ { \ ALIGNED_16( pixel buffC[640] ); \ ALIGNED_16( pixel buffA[640] ); \ int j = X264_MAX( i*4, 2 ); \ memset( buffC, 0, 640 * SIZEOF_PIXEL ); \ memset( buffA, 0, 640 * SIZEOF_PIXEL ); \ x264_t ha; \ ha.mc = mc_a; \ /* w12 is the same as w16 in some cases */ \ if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \ continue; \ if( mc_a.name[i] != mc_ref.name[i] ) \ { \ set_func_name( "%s_w%d", #name, j ); \ used_asm = 1; \ call_c1( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ mc_a.weight_cache(&ha, &weight); \ call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ for( int k = 0; k < 16; k++ ) \ if( memcmp( &buffC[k*32], &buffA[k*32], j * SIZEOF_PIXEL ) ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \ break; \ } \ /* omit unlikely high scales for benchmarking */ \ if( (s << (8-d)) < 512 ) \ { \ call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \ } \ } \ } ok = 1; used_asm = 0; int align_cnt = 0; for( int s = 0; s <= 127 && ok; s++ ) { for( int o = -128; o <= 127 && ok; o++ ) { if( rand() & 2047 ) continue; for( int d = 0; d <= 7 && ok; d++ ) { if( s == 1<> 1; int h = plane_specs[i].h; intptr_t src_stride = plane_specs[i].src_stride; intptr_t dst_stride = (2*w + 127) & ~63; assert( dst_stride * h <= 0x1000 ); pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); memset( pbuf3, 0, 0x1000*SIZEOF_PIXEL ); memset( pbuf4, 0, 0x1000*SIZEOF_PIXEL ); call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h ); call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h ); for( int y = 0; y < h; y++ ) if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*SIZEOF_PIXEL ) ) { ok = 0; fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); break; } } } if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) { set_func_name( "plane_copy_interleave" ); used_asm = 1; for( int i = 0; i < ARRAY_ELEMS(plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; int h = plane_specs[i].h; intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1; intptr_t dst_stride = (2*w + 127) & ~63; assert( dst_stride * h <= 0x1000 ); pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); memset( pbuf3, 0, 0x1000*SIZEOF_PIXEL ); memset( pbuf4, 0, 0x1000*SIZEOF_PIXEL ); call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h ); call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h ); for( int y = 0; y < h; y++ ) if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*SIZEOF_PIXEL ) ) { ok = 0; fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); break; } } } if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave ) { set_func_name( "plane_copy_deinterleave" ); used_asm = 1; for( int i = 0; i < ARRAY_ELEMS(plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; int h = plane_specs[i].h; intptr_t dst_stride = w; intptr_t src_stride = (2*w + 127) & ~63; intptr_t offv = (dst_stride*h + 63) & ~31; memset( pbuf3, 0, 0x1000 ); memset( pbuf4, 0, 0x1000 ); call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h ); call_a( mc_a.plane_copy_deinterleave, pbuf4, 
dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h ); for( int y = 0; y < h; y++ ) if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w ) || memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) ) { ok = 0; fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); break; } } } if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv ) { set_func_name( "plane_copy_deinterleave_yuyv" ); used_asm = 1; for( int i = 0; i < ARRAY_ELEMS(plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; int h = plane_specs[i].h; intptr_t dst_stride = ALIGN( w, 32/SIZEOF_PIXEL ); intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1; intptr_t offv = dst_stride*h; pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); memset( pbuf3, 0, 0x1000 ); memset( pbuf4, 0, 0x1000 ); /* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */ call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h ); call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h ); for( int y = 0; y < h; y++ ) if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*SIZEOF_PIXEL ) || memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*SIZEOF_PIXEL ) ) { fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); break; } } } if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb ) { set_func_name( "plane_copy_deinterleave_rgb" ); used_asm = 1; for( int i = 0; i < ARRAY_ELEMS(plane_specs); i++ ) { int w = (plane_specs[i].w + 2) >> 2; int h = plane_specs[i].h; intptr_t src_stride = plane_specs[i].src_stride; intptr_t dst_stride = ALIGN( w, 16 ); intptr_t offv = dst_stride*h + 16; pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); for( int pw = 3; pw <= 4; pw++ ) { memset( pbuf3, 0, 0x1000 ); memset( pbuf4, 0, 0x1000 ); call_c( mc_c.plane_copy_deinterleave_rgb, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf3+2*offv, dst_stride, src1, src_stride, pw, w, h ); call_a( mc_a.plane_copy_deinterleave_rgb, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf4+2*offv, dst_stride, src1, src_stride, pw, w, h ); for( int y = 0; y < h; y++ ) if( memcmp( pbuf3+y*dst_stride+0*offv, pbuf4+y*dst_stride+0*offv, w ) || memcmp( pbuf3+y*dst_stride+1*offv, pbuf4+y*dst_stride+1*offv, w ) || memcmp( pbuf3+y*dst_stride+2*offv, pbuf4+y*dst_stride+2*offv, w ) ) { ok = 0; fprintf( stderr, "plane_copy_deinterleave_rgb FAILED: w=%d h=%d stride=%d pw=%d\n", w, h, (int)src_stride, pw ); break; } } } } report( "plane_copy :" ); if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 ) { set_func_name( "plane_copy_deinterleave_v210" ); ok = 1; used_asm = 1; for( int i = 0; i < ARRAY_ELEMS(plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; int h = plane_specs[i].h; intptr_t dst_stride = ALIGN( w, 32 ); intptr_t src_stride = (w + 47) / 48 * 128 / (int)sizeof(uint32_t); intptr_t offv = dst_stride*h + 32; memset( pbuf3, 0, 0x1000 ); memset( pbuf4, 0, 0x1000 ); call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h ); call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h ); for( int y = 0; y < h; y++ ) if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(uint16_t) ) || memcmp( pbuf3+y*dst_stride+offv, 
pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) ) { ok = 0; fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); break; } } report( "v210 :" ); } if( mc_a.hpel_filter != mc_ref.hpel_filter ) { pixel *srchpel = pbuf1+8+2*64; pixel *dstc[3] = { pbuf3+8, pbuf3+8+16*64, pbuf3+8+32*64 }; pixel *dsta[3] = { pbuf4+8, pbuf4+8+16*64, pbuf4+8+32*64 }; void *tmp = pbuf3+49*64; set_func_name( "hpel_filter" ); ok = 1; used_asm = 1; memset( pbuf3, 0, 4096 * SIZEOF_PIXEL ); memset( pbuf4, 0, 4096 * SIZEOF_PIXEL ); call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp ); call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp ); for( int i = 0; i < 3; i++ ) for( int j = 0; j < 10; j++ ) //FIXME ideally the first pixels would match too, but they aren't actually used if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 * SIZEOF_PIXEL ) ) { ok = 0; fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j ); for( int k = 0; k < 48; k++ ) fprintf( stderr, FMT_PIXEL"%s", dstc[i][j*64+k], (k+1)&3 ? "" : " " ); fprintf( stderr, "\n" ); for( int k = 0; k < 48; k++ ) fprintf( stderr, FMT_PIXEL"%s", dsta[i][j*64+k], (k+1)&3 ? "" : " " ); fprintf( stderr, "\n" ); break; } report( "hpel filter :" ); } if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) { pixel *dstc[4] = { pbuf3, pbuf3+1024, pbuf3+2048, pbuf3+3072 }; pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 }; set_func_name( "lowres_init" ); ok = 1; used_asm = 1; for( int w = 96; w <= 96+24; w += 8 ) { intptr_t stride = (w*2+31)&~31; intptr_t stride_lowres = (w+31)&~31; call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 ); call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 ); for( int i = 0; i < 8; i++ ) { for( int j = 0; j < 4; j++ ) if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * SIZEOF_PIXEL ) ) { ok = 0; fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i ); for( int k = 0; k < w; k++ ) fprintf( stderr, "%d ", dstc[j][k+i*stride_lowres] ); fprintf( stderr, "\n" ); for( int k = 0; k < w; k++ ) fprintf( stderr, "%d ", dsta[j][k+i*stride_lowres] ); fprintf( stderr, "\n" ); break; } } } report( "lowres init :" ); } #define INTEGRAL_INIT( name, size, offset, cmp_len, ... 
)\ if( mc_a.name != mc_ref.name )\ {\ intptr_t stride = 96;\ set_func_name( #name );\ used_asm = 1;\ memcpy( buf3, buf1, size*2*stride );\ memcpy( buf4, buf1, size*2*stride );\ uint16_t *sum = (uint16_t*)buf3;\ call_c1( mc_c.name, sum+offset, __VA_ARGS__ );\ sum = (uint16_t*)buf4;\ call_a1( mc_a.name, sum+offset, __VA_ARGS__ );\ if( memcmp( buf3+2*offset, buf4+2*offset, cmp_len*2 )\ || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\ ok = 0;\ call_c2( mc_c.name, sum+offset, __VA_ARGS__ );\ call_a2( mc_a.name, sum+offset, __VA_ARGS__ );\ } ok = 1; used_asm = 0; INTEGRAL_INIT( integral_init4h, 2, stride, stride-4, pbuf2, stride ); INTEGRAL_INIT( integral_init8h, 2, stride, stride-8, pbuf2, stride ); INTEGRAL_INIT( integral_init4v, 14, 0, stride-8, sum+9*stride, stride ); INTEGRAL_INIT( integral_init8v, 9, 0, stride-8, stride ); report( "integral init :" ); ok = 1; used_asm = 0; if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost ) { used_asm = 1; x264_emms(); for( int i = 0; i < 10; i++ ) { float fps_factor = (rand30()&65535) / 65535.0f; set_func_name( "mbtree_propagate_cost" ); int16_t *dsta = (int16_t*)buf3; int16_t *dstc = dsta+400; uint16_t *prop = (uint16_t*)buf1; uint16_t *intra = (uint16_t*)buf4; uint16_t *inter = intra+128; uint16_t *qscale = inter+128; uint16_t *rnd = (uint16_t*)buf2; x264_emms(); for( int j = 0; j < 100; j++ ) { intra[j] = *rnd++ & 0x7fff; intra[j] += !intra[j]; inter[j] = *rnd++ & 0x7fff; qscale[j] = *rnd++ & 0x7fff; } call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 ); call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 ); // I don't care about exact rounding, this is just how close the floating-point implementation happens to be x264_emms(); for( int j = 0; j < 100 && ok; j++ ) { ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; if( !ok ) fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] ); } } } if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list ) { used_asm = 1; for( int i = 0; i < 8; i++ ) { set_func_name( "mbtree_propagate_list" ); x264_t h; int height = 4; int width = 128; int size = width*height; h.mb.i_mb_stride = width; h.mb.i_mb_width = width; h.mb.i_mb_height = height; uint16_t *ref_costsc = (uint16_t*)buf3 + width; uint16_t *ref_costsa = (uint16_t*)buf4 + width; int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size); int16_t *propagate_amount = (int16_t*)(mvs + width); uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width); h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size); int bipred_weight = (rand()%63)+1; int mb_y = rand()&3; int list = i&1; for( int j = -width; j < size+width; j++ ) ref_costsc[j] = ref_costsa[j] = rand()&32767; for( int j = 0; j < width; j++ ) { static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}}; for( int k = 0; k < 2; k++ ) mvs[j][k] = (rand()&127) - 64; propagate_amount[j] = rand()&32767; lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT; } call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); for( int j = -width; j < size+width && ok; j++ ) { ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1; if( !ok ) fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] ); 
} call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); } } static const uint16_t mbtree_fix8_counts[] = { 5, 384, 392, 400, 415 }; if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack ) { set_func_name( "mbtree_fix8_pack" ); used_asm = 1; float *fix8_src = (float*)(buf3 + 0x800); uint16_t *dstc = (uint16_t*)buf3; uint16_t *dsta = (uint16_t*)buf4; for( int i = 0; i < ARRAY_ELEMS(mbtree_fix8_counts); i++ ) { int count = mbtree_fix8_counts[i]; for( int j = 0; j < count; j++ ) fix8_src[j] = (int16_t)(rand()) / 256.0f; dsta[count] = 0xAAAA; call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count ); call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count ); if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA ) { ok = 0; fprintf( stderr, "mbtree_fix8_pack FAILED\n" ); break; } } } if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack ) { set_func_name( "mbtree_fix8_unpack" ); used_asm = 1; uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800); float *dstc = (float*)buf3; float *dsta = (float*)buf4; for( int i = 0; i < ARRAY_ELEMS(mbtree_fix8_counts); i++ ) { int count = mbtree_fix8_counts[i]; for( int j = 0; j < count; j++ ) fix8_src[j] = rand(); M32( &dsta[count] ) = 0xAAAAAAAA; call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count ); call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count ); if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA ) { ok = 0; fprintf( stderr, "mbtree_fix8_unpack FAILED\n" ); break; } } } report( "mbtree :" ); if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned ) { set_func_name( "memcpy_aligned" ); ok = 1; used_asm = 1; for( size_t size = 16; size < 512; size += 16 ) { for( size_t i = 0; i < size; i++ ) buf1[i] = (uint8_t)rand(); memset( buf4-1, 0xAA, size + 2 ); call_c( mc_c.memcpy_aligned, buf3, buf1, size ); call_a( mc_a.memcpy_aligned, buf4, buf1, size ); if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA ) { ok = 0; fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size ); break; } } report( "memcpy aligned :" ); } if( mc_a.memzero_aligned != mc_ref.memzero_aligned ) { set_func_name( "memzero_aligned" ); ok = 1; used_asm = 1; for( size_t size = 128; size < 1024; size += 128 ) { memset( buf4-1, 0xAA, size + 2 ); call_c( mc_c.memzero_aligned, buf3, size ); call_a( mc_a.memzero_aligned, buf4, size ); if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA ) { ok = 0; fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size ); break; } } report( "memzero aligned :" ); } return ret; } static int check_deblock( uint32_t cpu_ref, uint32_t cpu_new ) { x264_deblock_function_t db_c; x264_deblock_function_t db_ref; x264_deblock_function_t db_a; int ret = 0, ok = 1, used_asm = 0; int alphas[36], betas[36]; int8_t tcs[36][4]; x264_deblock_init( 0, &db_c, 0 ); x264_deblock_init( cpu_ref, &db_ref, 0 ); x264_deblock_init( cpu_new, &db_a, 0 ); /* not exactly the real values of a,b,tc but close enough */ for( int i = 35, a = 255, c = 250; i >= 0; i-- ) { alphas[i] = a << (BIT_DEPTH-8); betas[i] = (i+1)/2 << (BIT_DEPTH-8); tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8); tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8); tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8); a = a*9/10; c = c*9/10; } #define TEST_DEBLOCK( name, align, ... 
) \ for( int i = 0; i < 36; i++ ) \ { \ intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \ for( int j = 0; j < 1024; j++ ) \ /* two distributions of random to exercise different failure modes */ \ pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \ memcpy( pbuf4, pbuf3, 1024 * SIZEOF_PIXEL ); \ if( db_a.name != db_ref.name ) \ { \ set_func_name( #name ); \ used_asm = 1; \ call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \ call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \ if( memcmp( pbuf3, pbuf4, 1024 * SIZEOF_PIXEL ) ) \ { \ ok = 0; \ fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \ break; \ } \ call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \ call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \ } \ } TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] ); TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] ); TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] ); TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] ); TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] ); TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] ); TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] ); TEST_DEBLOCK( deblock_luma_intra[0], 0 ); TEST_DEBLOCK( deblock_luma_intra[1], 1 ); TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 ); TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 ); TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 ); TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 ); TEST_DEBLOCK( deblock_chroma_intra[1], 1 ); if( db_a.deblock_strength != db_ref.deblock_strength ) { set_func_name( "deblock_strength" ); used_asm = 1; for( int i = 0; i < 100; i++ ) { ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] ); uint8_t *nnz = &nnz_buf[8]; ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] ); ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] ); memset( bs, 99, sizeof(uint8_t)*2*4*8*2 ); for( int j = 0; j < X264_SCAN8_SIZE; j++ ) nnz[j] = ((rand()&7) == 7) * rand() & 0xf; for( int j = 0; j < 2; j++ ) for( int k = 0; k < X264_SCAN8_LUMA_SIZE; k++ ) { ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2; for( int l = 0; l < 2; l++ ) mv[j][k][l] = ((rand()&7) != 7) ?
(rand()&7) - 3 : (rand()&16383) - 8192; } call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) ); call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) ); if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) ) { ok = 0; fprintf( stderr, "deblock_strength: [FAILED]\n" ); for( int j = 0; j < 2; j++ ) { for( int k = 0; k < 2; k++ ) for( int l = 0; l < 4; l++ ) { for( int m = 0; m < 4; m++ ) fprintf( stderr, "%d ",bs[j][k][l][m] ); fprintf( stderr, "\n" ); } fprintf( stderr, "\n" ); } break; } } } report( "deblock :" ); return ret; } static int check_quant( uint32_t cpu_ref, uint32_t cpu_new ) { x264_quant_function_t qf_c; x264_quant_function_t qf_ref; x264_quant_function_t qf_a; ALIGNED_ARRAY_64( dctcoef, dct1,[64] ); ALIGNED_ARRAY_64( dctcoef, dct2,[64] ); ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] ); ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] ); ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] ); int ret = 0, ok, used_asm; int oks[3] = {1,1,1}, used_asms[3] = {0,0,0}; x264_t h_buf; x264_t *h = &h_buf; memset( h, 0, sizeof(*h) ); h->sps->i_chroma_format_idc = 1; x264_param_default( &h->param ); h->chroma_qp_table = i_chroma_qp_table + 12; h->param.analyse.b_transform_8x8 = 1; static const uint8_t cqm_test4[16] = { 6,4,6,4, 4,3,4,3, 6,4,6,4, 4,3,4,3 }; static const uint8_t cqm_test8[64] = { 3,3,4,3,3,3,4,3, 3,3,4,3,3,3,4,3, 4,4,5,4,4,4,5,4, 3,3,4,3,3,3,4,3, 3,3,4,3,3,3,4,3, 3,3,4,3,3,3,4,3, 4,4,5,4,4,4,5,4, 3,3,4,3,3,3,4,3 }; for( int i_cqm = 0; i_cqm < 6; i_cqm++ ) { if( i_cqm == 0 ) { for( int i = 0; i < 8; i++ ) h->sps->scaling_list[i] = x264_cqm_flat16; h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_FLAT; } else if( i_cqm == 1 ) { for( int i = 0; i < 8; i++ ) h->sps->scaling_list[i] = x264_cqm_jvt[i]; h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_JVT; } else if( i_cqm == 2 ) { for( int i = 0; i < 4; i++ ) h->sps->scaling_list[i] = cqm_test4; for( int i = 4; i < 8; i++ ) h->sps->scaling_list[i] = x264_cqm_flat16; h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_CUSTOM; } else if( i_cqm == 3 ) { for( int i = 0; i < 4; i++ ) h->sps->scaling_list[i] = x264_cqm_flat16; for( int i = 4; i < 8; i++ ) h->sps->scaling_list[i] = cqm_test8; h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_CUSTOM; } else { int max_scale = BIT_DEPTH < 10 ? 255 : 228; if( i_cqm == 4 ) for( int i = 0; i < 64; i++ ) cqm_buf[i] = 10 + rand() % (max_scale - 9); else for( int i = 0; i < 64; i++ ) cqm_buf[i] = 1; for( int i = 0; i < 8; i++ ) h->sps->scaling_list[i] = cqm_buf; h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_CUSTOM; } h->param.rc.i_qp_min = 0; h->param.rc.i_qp_max = QP_MAX_SPEC; x264_cqm_init( h ); x264_quant_init( h, 0, &qf_c ); x264_quant_init( h, cpu_ref, &qf_ref ); x264_quant_init( h, cpu_new, &qf_a ); #define INIT_QUANT8(j,max) \ { \ static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \ for( int i = 0; i < max; i++ ) \ { \ int scale = (PIXEL_MAX*scale1d[(i>>3)&7]*scale1d[i&7])/16; \ dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand30()%(2*scale+1))-scale : 0; \ } \ } #define INIT_QUANT4(j,max) \ { \ static const int scale1d[4] = {4,6,4,6}; \ for( int i = 0; i < max; i++ ) \ { \ int scale = PIXEL_MAX*scale1d[(i>>2)&3]*scale1d[i&3]; \ dct1[i] = dct2[i] = (j>>(i>>4))&1 ? 
(rand30()%(2*scale+1))-scale : 0; \ } \ } #define TEST_QUANT_DC( name, cqm ) \ if( qf_a.name != qf_ref.name ) \ { \ set_func_name( #name ); \ used_asms[0] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ for( int j = 0; j < 2; j++ ) \ { \ int result_c, result_a; \ for( int i = 0; i < 16; i++ ) \ dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \ result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \ { \ oks[0] = 0; \ fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ break; \ } \ call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ } \ } \ } #define TEST_QUANT( qname, block, type, w, maxj ) \ if( qf_a.qname != qf_ref.qname ) \ { \ set_func_name( #qname ); \ used_asms[0] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ for( int j = 0; j < maxj; j++ ) \ { \ INIT_QUANT##type(j, w*w) \ int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \ { \ oks[0] = 0; \ fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ break; \ } \ call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \ } \ } \ } TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 ); TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 ); TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 ); TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 ); TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 ); TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 ); TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] ); TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] ); #define TEST_DEQUANT( qname, dqname, block, w ) \ if( qf_a.dqname != qf_ref.dqname ) \ { \ set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \ used_asms[1] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ INIT_QUANT##w(1, w*w) \ qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \ { \ oks[1] = 0; \ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ break; \ } \ call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ } \ } TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 ); TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 ); TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 ); TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 ); #define TEST_DEQUANT_DC( qname, dqname, block, w ) \ if( qf_a.dqname != qf_ref.dqname ) \ { \ set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \ used_asms[1] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ for( int i = 0; i < 
16; i++ ) \ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \ qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \ { \ oks[1] = 0; \ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ } \ call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ } \ } TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 ); if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc ) { set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" ); used_asms[1] = 1; for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- ) { for( int i = 0; i < 8; i++ ) dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 ); call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 ); for( int i = 0; i < 8; i++ ) if( dct3[i][0] != dct4[i][0] ) { oks[1] = 0; fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); break; } } } if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly ) { set_func_name( "idct_dequant_2x4_dconly_%s", i_cqm?"cqm":"flat" ); used_asms[1] = 1; for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- ) { for( int i = 0; i < 8; i++ ) dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); memcpy( dct2, dct1, 8*sizeof(dctcoef) ); call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 ); call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 ); if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) ) { oks[1] = 0; fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); break; } call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 ); call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 ); } } #define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \ if( qf_a.optname != qf_ref.optname ) \ { \ set_func_name( #optname ); \ used_asms[2] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ int qpdc = qp + (size == 8 ? 
3 : 0); \ int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \ if( dmf > 32*64 ) \ continue; \ for( int i = 16;; i <<= 1 ) \ { \ int res_c, res_asm; \ int max = X264_MIN( i, PIXEL_MAX*16 ); \ for( int j = 0; j < size; j++ ) \ dct1[j] = rand()%(max*2+1) - max; \ for( int j = 0; j <= size; j += 4 ) \ qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \ memcpy( dct2, dct1, size*sizeof(dctcoef) ); \ res_c = call_c1( qf_c.optname, dct1, dmf ); \ res_asm = call_a1( qf_a.optname, dct2, dmf ); \ if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \ { \ oks[2] = 0; \ fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \ } \ call_c2( qf_c.optname, dct1, dmf ); \ call_a2( qf_a.optname, dct2, dmf ); \ if( i >= PIXEL_MAX*16 ) \ break; \ } \ } \ } TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 ); TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 ); x264_cqm_delete( h ); } ok = oks[0]; used_asm = used_asms[0]; report( "quant :" ); ok = oks[1]; used_asm = used_asms[1]; report( "dequant :" ); ok = oks[2]; used_asm = used_asms[2]; report( "optimize chroma dc :" ); ok = 1; used_asm = 0; if( qf_a.denoise_dct != qf_ref.denoise_dct ) { used_asm = 1; for( int size = 16; size <= 64; size += 48 ) { set_func_name( "denoise_dct" ); memcpy( dct1, buf1, size*sizeof(dctcoef) ); memcpy( dct2, buf1, size*sizeof(dctcoef) ); memcpy( buf3+256, buf3, 256 ); call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size ); call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size ); if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) ) ok = 0; call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size ); call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size ); } } report( "denoise dct :" ); #define TEST_DECIMATE( decname, w, ac, thresh ) \ if( qf_a.decname != qf_ref.decname ) \ { \ set_func_name( #decname ); \ used_asm = 1; \ for( int i = 0; i < 100; i++ ) \ { \ static const int distrib[16] = {1,1,1,1,1,1,1,1,1,1,1,1,2,3,4};\ static const int zerorate_lut[4] = {3,7,15,31};\ int zero_rate = zerorate_lut[i&3];\ for( int idx = 0; idx < w*w; idx++ ) \ { \ int sign = (rand()&1) ? 
-1 : 1; \ int abs_level = distrib[rand()&15]; \ if( abs_level == 4 ) abs_level = rand()&0x3fff; \ int zero = !(rand()&zero_rate); \ dct1[idx] = zero * abs_level * sign; \ } \ if( ac ) \ dct1[0] = 0; \ int result_c = call_c( qf_c.decname, dct1 ); \ int result_a = call_a( qf_a.decname, dct1 ); \ if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \ { \ ok = 0; \ fprintf( stderr, #decname ": [FAILED]\n" ); \ break; \ } \ } \ } ok = 1; used_asm = 0; TEST_DECIMATE( decimate_score64, 8, 0, 6 ); TEST_DECIMATE( decimate_score16, 4, 0, 6 ); TEST_DECIMATE( decimate_score15, 4, 1, 7 ); report( "decimate_score :" ); #define TEST_LAST( last, lastname, size, ac ) \ if( qf_a.last != qf_ref.last ) \ { \ set_func_name( #lastname ); \ used_asm = 1; \ for( int i = 0; i < 100; i++ ) \ { \ int nnz = 0; \ int max = rand() & (size-1); \ memset( dct1, 0, 64*sizeof(dctcoef) ); \ for( int idx = ac; idx < max; idx++ ) \ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ dct1[ac] = 1; \ int result_c = call_c( qf_c.last, dct1+ac ); \ int result_a = call_a( qf_a.last, dct1+ac ); \ if( result_c != result_a ) \ { \ ok = 0; \ fprintf( stderr, #lastname ": [FAILED]\n" ); \ break; \ } \ } \ } ok = 1; used_asm = 0; TEST_LAST( coeff_last4 , coeff_last4, 4, 0 ); TEST_LAST( coeff_last8 , coeff_last8, 8, 0 ); TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 16, 1 ); TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 ); TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 ); report( "coeff_last :" ); #define TEST_LEVELRUN( lastname, name, size, ac ) \ if( qf_a.lastname != qf_ref.lastname ) \ { \ set_func_name( #name ); \ used_asm = 1; \ for( int i = 0; i < 100; i++ ) \ { \ x264_run_level_t runlevel_c, runlevel_a; \ int nnz = 0; \ int max = rand() & (size-1); \ memset( dct1, 0, 64*sizeof(dctcoef) ); \ memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \ memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \ for( int idx = ac; idx < max; idx++ ) \ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ dct1[ac] = 1; \ int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \ int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \ runlevel_c.mask != runlevel_a.mask || \ memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c)) \ { \ ok = 0; \ fprintf( stderr, #name ": [FAILED]\n" ); \ break; \ } \ } \ } ok = 1; used_asm = 0; TEST_LEVELRUN( coeff_level_run4 , coeff_level_run4, 4, 0 ); TEST_LEVELRUN( coeff_level_run8 , coeff_level_run8, 8, 0 ); TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 16, 1 ); TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 ); report( "coeff_level_run :" ); return ret; } static int check_intra( uint32_t cpu_ref, uint32_t cpu_new ) { int ret = 0, ok = 1, used_asm = 0; ALIGNED_ARRAY_32( pixel, edge,[36] ); ALIGNED_ARRAY_32( pixel, edge2,[36] ); ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] ); struct { x264_predict_t predict_16x16[4+3]; x264_predict_t predict_8x8c[4+3]; x264_predict_t predict_8x16c[4+3]; x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; x264_predict_8x8_filter_t predict_8x8_filter; } ip_c, ip_ref, ip_a; x264_predict_16x16_init( 0, ip_c.predict_16x16 ); x264_predict_8x8c_init( 0, ip_c.predict_8x8c ); x264_predict_8x16c_init( 0, ip_c.predict_8x16c ); x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter ); x264_predict_4x4_init( 0, ip_c.predict_4x4 ); 
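    /* Three sets of prediction functions are compared: ip_c holds the plain C versions
     * (cpu flags = 0), ip_ref the versions for the previously validated CPU flag set, and
     * ip_a the versions for the newly added flags. Each INTRA_TEST below only runs when the
     * ip_a pointer differs from the ip_ref one, so only newly introduced implementations are
     * exercised at each flag level. */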
x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 ); x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c ); x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c ); x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter ); x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 ); x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 ); x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c ); x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c ); x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter ); x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 ); memcpy( fdec, pbuf1, 32*20 * SIZEOF_PIXEL );\ ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); #define INTRA_TEST( name, dir, w, h, align, bench, ... )\ if( ip_a.name[dir] != ip_ref.name[dir] )\ {\ set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\ used_asm = 1;\ memcpy( pbuf3, fdec, FDEC_STRIDE*20 * SIZEOF_PIXEL );\ memcpy( pbuf4, fdec, FDEC_STRIDE*20 * SIZEOF_PIXEL );\ for( int a = 0; a < (do_bench ? 64/SIZEOF_PIXEL : 1); a += align )\ {\ call_c##bench( ip_c.name[dir], pbuf3+48+a, ##__VA_ARGS__ );\ call_a##bench( ip_a.name[dir], pbuf4+48+a, ##__VA_ARGS__ );\ if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * SIZEOF_PIXEL ) )\ {\ fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\ ok = 0;\ if( ip_c.name == (void *)ip_c.predict_8x8 )\ {\ for( int k = -1; k < 16; k++ )\ fprintf( stderr, FMT_PIXEL" ", edge[16+k] );\ fprintf( stderr, "\n" );\ }\ for( int j = 0; j < h; j++ )\ {\ if( ip_c.name == (void *)ip_c.predict_8x8 )\ fprintf( stderr, FMT_PIXEL" ", edge[14-j] );\ for( int k = 0; k < w; k++ )\ fprintf( stderr, FMT_PIXEL" ", pbuf4[48+k+j*FDEC_STRIDE] );\ fprintf( stderr, "\n" );\ }\ fprintf( stderr, "\n" );\ for( int j = 0; j < h; j++ )\ {\ if( ip_c.name == (void *)ip_c.predict_8x8 )\ fprintf( stderr, " " );\ for( int k = 0; k < w; k++ )\ fprintf( stderr, FMT_PIXEL" ", pbuf3[48+k+j*FDEC_STRIDE] );\ fprintf( stderr, "\n" );\ }\ break;\ }\ }\ } for( int i = 0; i < 12; i++ ) INTRA_TEST( predict_4x4, i, 4, 4, 4, ); for( int i = 0; i < 7; i++ ) INTRA_TEST( predict_8x8c, i, 8, 8, 16, ); for( int i = 0; i < 7; i++ ) INTRA_TEST( predict_8x16c, i, 8, 16, 16, ); for( int i = 0; i < 7; i++ ) INTRA_TEST( predict_16x16, i, 16, 16, 16, ); for( int i = 0; i < 12; i++ ) INTRA_TEST( predict_8x8, i, 8, 8, 8, , edge ); set_func_name("intra_predict_8x8_filter"); if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter ) { used_asm = 1; for( int i = 0; i < 32; i++ ) { if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) ) continue; int neighbor = (i&24)>>1; memset( edge, 0, 36*SIZEOF_PIXEL ); memset( edge2, 0, 36*SIZEOF_PIXEL ); call_c( ip_c.predict_8x8_filter, pbuf1+48, edge, neighbor, i&7 ); call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 ); if( !(neighbor&MB_TOPLEFT) ) edge[15] = edge2[15] = 0; if( memcmp( edge+7, edge2+7, (i&MB_TOPRIGHT ? 26 : i&MB_TOP ? 17 : 8) * SIZEOF_PIXEL ) ) { fprintf( stderr, "predict_8x8_filter : [FAILED] %d %d\n", (i&24)>>1, i&7); ok = 0; } } } #define EXTREMAL_PLANE( w, h ) \ { \ int max[7]; \ for( int j = 0; j < 7; j++ ) \ max[j] = test ? 
rand()&PIXEL_MAX : PIXEL_MAX; \ fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \ for( int j = 0; j < w/2; j++ ) \ fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \ for( int j = w/2; j < w-1; j++ ) \ fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \ fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \ for( int j = 0; j < h/2; j++ ) \ fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \ for( int j = h/2; j < h-1; j++ ) \ fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \ fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \ } /* Extremal test case for planar prediction. */ for( int test = 0; test < 100 && ok; test++ ) for( int i = 0; i < 128 && ok; i++ ) { EXTREMAL_PLANE( 8, 8 ); INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 8, 64, 1 ); EXTREMAL_PLANE( 8, 16 ); INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P, 8, 16, 64, 1 ); EXTREMAL_PLANE( 16, 16 ); INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 16, 64, 1 ); } report( "intra pred :" ); return ret; } #define DECL_CABAC(cpu) \ static void run_cabac_decision_##cpu( x264_t *h, uint8_t *dst )\ {\ x264_cabac_t cb;\ x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\ x264_cabac_encode_init( &cb, dst, dst+0xff0 );\ for( int i = 0; i < 0x1000; i++ )\ x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\ }\ static void run_cabac_bypass_##cpu( x264_t *h, uint8_t *dst )\ {\ x264_cabac_t cb;\ x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\ x264_cabac_encode_init( &cb, dst, dst+0xff0 );\ for( int i = 0; i < 0x1000; i++ )\ x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\ }\ static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\ {\ x264_cabac_t cb;\ x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\ x264_cabac_encode_init( &cb, dst, dst+0xff0 );\ for( int i = 0; i < 0x1000; i++ )\ x264_cabac_encode_terminal_##cpu( &cb );\ } DECL_CABAC(c) #if HAVE_MMX DECL_CABAC(asm) #elif HAVE_AARCH64 DECL_CABAC(asm) #else #define run_cabac_decision_asm run_cabac_decision_c #define run_cabac_bypass_asm run_cabac_bypass_c #define run_cabac_terminal_asm run_cabac_terminal_c #endif extern const uint8_t x264_count_cat_m1[14]; static int check_cabac( uint32_t cpu_ref, uint32_t cpu_new ) { int ret = 0, ok = 1, used_asm = 0; x264_t h; h.sps->i_chroma_format_idc = 3; x264_bitstream_function_t bs_ref; x264_bitstream_function_t bs_a; x264_bitstream_init( cpu_ref, &bs_ref ); x264_bitstream_init( cpu_new, &bs_a ); x264_quant_init( &h, cpu_new, &h.quantf ); h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4; /* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. 
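       GET_CB() below re-initializes the encoder buffer and clears the f8_bits_encoded
       counter before every call for exactly this reason.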
*/ #define GET_CB( i ) (\ x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\ cb[i].f8_bits_encoded = 0, &cb[i] ) #define CABAC_RESIDUAL(name, start, end, rd)\ {\ if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\ {\ used_asm = 1;\ set_func_name( #name );\ for( int i = 0; i < 2; i++ )\ {\ for( intptr_t ctx_block_cat = start; ctx_block_cat <= end; ctx_block_cat++ )\ {\ for( int j = 0; j < 256; j++ )\ {\ ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\ uint8_t bitstream[2][1<<16];\ static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\ int ac = ctx_ac[ctx_block_cat];\ int nz = 0;\ while( !nz )\ {\ for( int k = 0; k <= x264_count_cat_m1[ctx_block_cat]; k++ )\ {\ /* Very rough distribution that covers possible inputs */\ int rnd = rand();\ int coef = !(rnd&3);\ coef += !(rnd& 15) * (rand()&0x0006);\ coef += !(rnd& 63) * (rand()&0x0008);\ coef += !(rnd& 255) * (rand()&0x00F0);\ coef += !(rnd&1023) * (rand()&0x7F00);\ nz |= dct[0][ac+k] = dct[1][ac+k] = coef * ((rand()&1) ? 1 : -1);\ }\ }\ h.mb.b_interlaced = i;\ x264_cabac_t cb[2];\ x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\ x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\ if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\ call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\ call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\ ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\ if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ if( !ok )\ {\ fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\ if( rd && cb[0].f8_bits_encoded != cb[1].f8_bits_encoded )\ fprintf( stderr, " (%d != %d)", cb[0].f8_bits_encoded, cb[1].f8_bits_encoded );\ fprintf( stderr, "\n");\ goto name##fail;\ }\ if( (j&15) == 0 )\ {\ call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\ call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\ }\ }\ }\ }\ }\ }\ name##fail: CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 ) report( "cabac residual:" ); ok = 1; used_asm = 0; CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 ) CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 ) report( "cabac residual rd:" ); if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm ) return ret; ok = 1; used_asm = 0; x264_cabac_init( &h ); set_func_name( "cabac_encode_decision" ); memcpy( buf4, buf3, 0x1000 ); call_c( run_cabac_decision_c, &h, buf3 ); call_a( run_cabac_decision_asm, &h, buf4 ); ok = !memcmp( buf3, buf4, 0x1000 ); report( "cabac decision:" ); set_func_name( "cabac_encode_bypass" ); memcpy( buf4, buf3, 0x1000 ); call_c( run_cabac_bypass_c, &h, buf3 ); call_a( run_cabac_bypass_asm, &h, buf4 ); ok = !memcmp( buf3, buf4, 0x1000 ); report( "cabac bypass:" ); set_func_name( "cabac_encode_terminal" ); memcpy( buf4, buf3, 0x1000 ); call_c( run_cabac_terminal_c, &h, buf3 ); call_a( run_cabac_terminal_asm, &h, buf4 ); ok = !memcmp( buf3, buf4, 0x1000 ); report( "cabac terminal:" ); return ret; } static int check_bitstream( uint32_t cpu_ref, uint32_t cpu_new ) { x264_bitstream_function_t bs_c; x264_bitstream_function_t bs_ref; x264_bitstream_function_t bs_a; int ret = 0, ok = 1, used_asm = 0; x264_bitstream_init( 0, &bs_c ); x264_bitstream_init( cpu_ref, &bs_ref ); x264_bitstream_init( cpu_new, &bs_a ); 
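    /* nal_escape rewrites a raw NAL payload with emulation-prevention bytes inserted.
     * The test below feeds it corner-case sizes and several zero-heavy input distributions
     * and requires the optimized output to match the C version byte for byte. */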
if( bs_a.nal_escape != bs_ref.nal_escape ) { int size = 0x4000; uint8_t *input = malloc(size+100); uint8_t *output1 = malloc(size*2); uint8_t *output2 = malloc(size*2); used_asm = 1; set_func_name( "nal_escape" ); for( int i = 0; i < 100; i++ ) { /* Test corner-case sizes */ int test_size = i < 10 ? i+1 : rand() & 0x3fff; /* Test 8 different probability distributions of zeros */ for( int j = 0; j < test_size+32; j++ ) input[j] = (uint8_t)((rand()&((1 << ((i&7)+1)) - 1)) * rand()); uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size ); uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size ); int size_c = end_c-output1; int size_a = end_a-output2; if( size_c != size_a || memcmp( output1, output2, size_c ) ) { fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a ); ok = 0; break; } } for( int j = 0; j < size+32; j++ ) input[j] = (uint8_t)rand(); call_c2( bs_c.nal_escape, output1, input, input+size ); call_a2( bs_a.nal_escape, output2, input, input+size ); free(input); free(output1); free(output2); } report( "nal escape:" ); return ret; } static int check_all_funcs( uint32_t cpu_ref, uint32_t cpu_new ) { return check_pixel( cpu_ref, cpu_new ) + check_dct( cpu_ref, cpu_new ) + check_mc( cpu_ref, cpu_new ) + check_intra( cpu_ref, cpu_new ) + check_deblock( cpu_ref, cpu_new ) + check_quant( cpu_ref, cpu_new ) + check_cabac( cpu_ref, cpu_new ) + check_bitstream( cpu_ref, cpu_new ); } static int add_flags( uint32_t *cpu_ref, uint32_t *cpu_new, uint32_t flags, const char *name ) { *cpu_ref = *cpu_new; *cpu_new |= flags; #if STACK_ALIGNMENT < 16 *cpu_new |= X264_CPU_STACK_MOD4; #endif if( *cpu_new & X264_CPU_SSE2_IS_FAST ) *cpu_new &= ~X264_CPU_SSE2_IS_SLOW; if( !quiet ) fprintf( stderr, "x264: %s\n", name ); return check_all_funcs( *cpu_ref, *cpu_new ); } static int check_all_flags( void ) { int ret = 0; uint32_t cpu0 = 0, cpu1 = 0; uint32_t cpu_detect = x264_cpu_detect(); #if HAVE_MMX if( cpu_detect & X264_CPU_AVX512 ) simd_warmup_func = x264_checkasm_warmup_avx512; else if( cpu_detect & X264_CPU_AVX ) simd_warmup_func = x264_checkasm_warmup_avx; #endif simd_warmup(); #if ARCH_AARCH64 && HAVE_SVE char buf[20]; #endif #if ARCH_X86 || ARCH_X86_64 if( cpu_detect & X264_CPU_MMX2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; #if ARCH_X86 ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif } if( cpu_detect & X264_CPU_SSE ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" ); if( cpu_detect & X264_CPU_SSE2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" ); cpu1 &= ~X264_CPU_SLOW_SHUFFLE; } if( cpu_detect & X264_CPU_LZCNT ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } if( cpu_detect & X264_CPU_SSE3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" ); cpu1 &= ~X264_CPU_CACHELINE_64; } if( cpu_detect & X264_CPU_SSSE3 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); cpu1 &= ~X264_CPU_CACHELINE_64; 
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" ); cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; if( cpu_detect & X264_CPU_LZCNT ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } } if( cpu_detect & X264_CPU_SSE4 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); if( cpu_detect & X264_CPU_SSE42 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" ); if( cpu_detect & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( cpu_detect & X264_CPU_XOP ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" ); if( cpu_detect & X264_CPU_FMA4 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } if( cpu_detect & X264_CPU_FMA3 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); if( cpu_detect & X264_CPU_BMI1 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); if( cpu_detect & X264_CPU_BMI2 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); if( cpu_detect & X264_CPU_AVX2 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); if( cpu_detect & X264_CPU_AVX512 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" ); #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { fprintf( stderr, "x264: ALTIVEC against C\n" ); ret = check_all_funcs( 0, X264_CPU_ALTIVEC ); } #elif ARCH_ARM if( cpu_detect & X264_CPU_NEON ) x264_checkasm_call = x264_checkasm_call_neon; if( cpu_detect & X264_CPU_ARMV6 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" ); if( cpu_detect & X264_CPU_NEON ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); if( cpu_detect & X264_CPU_FAST_NEON_MRC ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" ); #elif ARCH_AARCH64 if( cpu_detect & X264_CPU_ARMV8 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" ); if( cpu_detect & X264_CPU_NEON ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); if( cpu_detect & X264_CPU_DOTPROD ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_DOTPROD, "DOTPROD" ); if( cpu_detect & X264_CPU_I8MM ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_I8MM, "I8MM" ); #if HAVE_SVE if( cpu_detect & X264_CPU_SVE ) { snprintf( buf, sizeof( buf ), "SVE (%d bits)", x264_checkasm_sve_length() ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SVE, buf ); } if( cpu_detect & X264_CPU_SVE2 ) { snprintf( buf, sizeof( buf ), "SVE2 (%d bits)", x264_checkasm_sve_length() ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SVE2, buf ); } #endif #elif ARCH_MIPS if( cpu_detect & X264_CPU_MSA ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_MSA, "MSA" ); #elif ARCH_LOONGARCH if( cpu_detect & X264_CPU_LSX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_LSX, "LSX" ); if( cpu_detect & X264_CPU_LASX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_LASX, "LASX" ); #endif return ret; } REALIGN_STACK int main( int argc, char **argv ) { #ifdef _WIN32 /* Disable the Windows Error Reporting dialog */ SetErrorMode( SEM_NOGPFAULTERRORBOX ); #endif if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) ) { #if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS && !ARCH_LOONGARCH fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" ); return 1; #endif do_bench = 1; if( argv[1][7] == '=' ) { bench_pattern = argv[1]+8; bench_pattern_len = strlen(bench_pattern); } argc--; argv++; } unsigned seed = ( 
argc > 1 ) ? strtoul(argv[1], NULL, 0) : (unsigned)x264_mdate(); fprintf( stderr, "x264: using random seed %u\n", seed ); srand( seed ); buf1 = x264_malloc( 0x1e00 + 0x2000*SIZEOF_PIXEL ); pbuf1 = x264_malloc( 0x1e00*SIZEOF_PIXEL ); if( !buf1 || !pbuf1 ) { fprintf( stderr, "malloc failed, unable to initiate tests!\n" ); return -1; } #define INIT_POINTER_OFFSETS\ buf2 = buf1 + 0xf00;\ buf3 = buf2 + 0xf00;\ buf4 = buf3 + 0x1000*SIZEOF_PIXEL;\ pbuf2 = pbuf1 + 0xf00;\ pbuf3 = (pixel*)buf3;\ pbuf4 = (pixel*)buf4; INIT_POINTER_OFFSETS; for( int i = 0; i < 0x1e00; i++ ) { buf1[i] = rand() & 0xFF; pbuf1[i] = rand() & PIXEL_MAX; } memset( buf1+0x1e00, 0, 0x2000*SIZEOF_PIXEL ); if( x264_stack_pagealign( check_all_flags, 0 ) ) { fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" ); return -1; } fprintf( stderr, "x264: All tests passed Yeah :)\n" ); if( do_bench ) print_bench(); return 0; } x264-master/tools/cltostr.sh000077500000000000000000000016261502133446700162550ustar00rootroot00000000000000#!/bin/sh # Convert standard input to a C char array, write to a file, then create an # MD5 sum of that file and append said MD5 sum as char array to the file. [ -n "$1" ] || exit 1 # Filter out whitespace, empty lines, and comments. sanitize() { sed 's/^[[:space:]]*//; /^$/d; /^\/\//d' } # Convert stdin to a \0-terminated char array. dump() { echo "static const char $1[] = {" od -v -A n -t x1 | sed 's/[[:space:]]*\([[:alnum:]]\{2\}\)/0x\1, /g' echo '0x00 };' } # Print MD5 hash w/o newline character to not embed the character in the array. hash() { # md5sum is not standard, so try different platform-specific alternatives. { md5sum "$1" || md5 -q "$1" || digest -a md5 "$1"; } 2>/dev/null | cut -b -32 | tr -d '\n\r' } trap 'rm -f "$1.temp"' EXIT sanitize | tee "$1.temp" | dump 'x264_opencl_source' > "$1" hash "$1.temp" | dump 'x264_opencl_source_hash' >> "$1" x264-master/tools/countquant_x264.pl000077500000000000000000000025241502133446700175460ustar00rootroot00000000000000#!/bin/env perl # countquant_x264.pl: displays statistics from x264 multipass logfiles # by Loren Merritt, 2005-4-5 @size{I,P,B} = @n{I,P,B} = (0)x3; sub proc_file { my $fh = shift; while(<$fh>) { /type:(.) q:(\d+\.\d+) tex:(\d+) mv:(\d+) misc:(\d+)/ or next; $type = uc $1; $n{$type} ++; $q[int($2+.5)] ++; $avgq += $2; $avgq{$type} += $2; my $bytes = ($3+$4+$5)/8; $size{$type} += $bytes; } $size = $size{I} + $size{P} + $size{B}; $n = $n{I} + $n{P} + $n{B}; $n or die "unrecognized input\n"; } if(@ARGV) { foreach(@ARGV) { open $fh, "<", $_ or die "can't open '$_': $!"; proc_file($fh); } } else { proc_file(STDIN); } for(0..51) { $q[$_] or next; printf "q%2d: %6d %4.1f%%\n", $_, $q[$_], 100*$q[$_]/$n; } print "\n"; $digits = int(log($n+1)/log(10))+2; printf "All: %${digits}d %s avgQP:%5.2f avgBytes:%5d\n", $n, $n==$n{I}?" ":"", $avgq/$n, $size/$n; foreach(qw(I P B S)) { $n{$_} or next; printf "%s: %${digits}d (%4.1f%%) avgQP:%5.2f avgBytes:%5d\n", $_, $n{$_}, 100*$n{$_}/$n, $avgq{$_}/$n{$_}, $size{$_}/$n{$_}; } print "\n"; printf "total size: $size B = %.2f KiB = %.2f MiB\n", $size/2**10, $size/2**20; print "bitrate: ", join("\n = ", map sprintf("%.2f kbps @ %s fps", $_*$size*8/1000/$n, $_), 23.976, 25, 29.97), "\n"; x264-master/tools/digress/000077500000000000000000000000001502133446700156575ustar00rootroot00000000000000x264-master/tools/digress/__init__.py000066400000000000000000000003341502133446700177700ustar00rootroot00000000000000""" Automated regression/unit testing suite. 
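Fixtures group test cases, cache their results per SCM revision, and expose
commands to run, compare, multicompare and bisect those results across
revisions (see digress.testing and digress.cli).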
""" __version__ = '0.2' def digress(fixture): """ Command-line helper for Digress. """ from digress.cli import Dispatcher Dispatcher(fixture).dispatch() x264-master/tools/digress/cli.py000066400000000000000000000104751502133446700170070ustar00rootroot00000000000000""" Digress's CLI interface. """ import inspect import sys from optparse import OptionParser import textwrap from types import MethodType from digress import __version__ as version def dispatchable(func): """ Mark a method as dispatchable. """ func.digress_dispatchable = True return func class Dispatcher(object): """ Dispatcher for CLI commands. """ def __init__(self, fixture): self.fixture = fixture fixture.dispatcher = self def _monkey_print_help(self, optparse, *args, **kwargs): # monkey patches OptionParser._print_help OptionParser.print_help(optparse, *args, **kwargs) print >>sys.stderr, "\nAvailable commands:" maxlen = max([ len(command_name) for command_name in self.commands ]) descwidth = 80 - maxlen - 4 for command_name, command_meth in self.commands.iteritems(): print >>sys.stderr, " %s %s\n" % ( command_name.ljust(maxlen + 1), ("\n" + (maxlen + 4) * " ").join( textwrap.wrap(" ".join(filter( None, command_meth.__doc__.strip().replace("\n", " ").split(" ") )), descwidth ) ) ) def _enable_flush(self): self.fixture.flush_before = True def _populate_parser(self): self.commands = self._get_commands() self.optparse = OptionParser( usage = "usage: %prog [options] command [args]", description = "Digress CLI frontend for %s." % self.fixture.__class__.__name__, version = "Digress %s" % version ) self.optparse.print_help = MethodType(self._monkey_print_help, self.optparse, OptionParser) self.optparse.add_option( "-f", "--flush", action="callback", callback=lambda option, opt, value, parser: self._enable_flush(), help="flush existing data for a revision before testing" ) self.optparse.add_option( "-c", "--cases", metavar="FOO,BAR", action="callback", dest="cases", type=str, callback=lambda option, opt, value, parser: self._select_cases(*value.split(",")), help="test cases to run, run with command list to see full list" ) def _select_cases(self, *cases): self.fixture.cases = filter(lambda case: case.__name__ in cases, self.fixture.cases) def _get_commands(self): commands = {} for name, member in inspect.getmembers(self.fixture): if hasattr(member, "digress_dispatchable"): commands[name] = member return commands def _run_command(self, name, *args): if name not in self.commands: print >>sys.stderr, "error: %s is not a valid command\n" % name self.optparse.print_help() return command = self.commands[name] argspec = inspect.getargspec(command) max_arg_len = len(argspec.args) - 1 min_arg_len = max_arg_len - ((argspec.defaults is not None) and len(argspec.defaults) or 0) if len(args) < min_arg_len: print >>sys.stderr, "error: %s takes at least %d arguments\n" % ( name, min_arg_len ) print >>sys.stderr, "%s\n" % command.__doc__ self.optparse.print_help() return if len(args) > max_arg_len: print >>sys.stderr, "error: %s takes at most %d arguments\n" % ( name, max_arg_len ) print >>sys.stderr, "%s\n" % command.__doc__ self.optparse.print_help() return command(*args) def pre_dispatch(self): pass def dispatch(self): self._populate_parser() self.optparse.parse_args() self.pre_dispatch() args = self.optparse.parse_args()[1] # arguments may require reparsing after pre_dispatch; see test_x264.py if len(args) == 0: print >>sys.stderr, "error: no command specified\n" self.optparse.print_help() return command = args[0] addenda = args[1:] 
self._run_command(command, *addenda) x264-master/tools/digress/comparers.py000066400000000000000000000035121502133446700202250ustar00rootroot00000000000000""" Digress comparers. """ from digress.errors import ComparisonError import os from itertools import imap, izip def compare_direct(value_a, value_b): if value_a != value_b: raise ComparisonError("%s is not %s" % (value_a, value_b)) def compare_pass(value_a, value_b): """ Always true, as long as the test is passed. """ def compare_tolerance(tolerance): def _compare_tolerance(value_a, value_b): if abs(value_a - value_b) > tolerance: raise ComparisonError("%s is not %s (tolerance: %s)" % ( value_a, value_b, tolerance )) return _compare_tolerance def compare_files(file_a, file_b): size_a = os.path.getsize(file_a) size_b = os.path.getsize(file_b) print file_a, file_b if size_a != size_b: raise ComparisonError("%s is not the same size as %s" % ( file_a, file_b )) BUFFER_SIZE = 8196 offset = 0 with open(file_a) as f_a: with open(file_b) as f_b: for chunk_a, chunk_b in izip( imap( lambda i: f_a.read(BUFFER_SIZE), xrange(size_a // BUFFER_SIZE + 1) ), imap( lambda i: f_b.read(BUFFER_SIZE), xrange(size_b // BUFFER_SIZE + 1) ) ): chunk_size = len(chunk_a) if chunk_a != chunk_b: for i in xrange(chunk_size): if chunk_a[i] != chunk_b[i]: raise ComparisonError("%s differs from %s at offset %d" % ( file_a, file_b, offset + i )) offset += chunk_size x264-master/tools/digress/constants.py000066400000000000000000000002441502133446700202450ustar00rootroot00000000000000""" All of Digress's constants. """ TEST_PASS = 0 TEST_FAIL = 1 TEST_DISABLED = 2 TEST_SKIPPED = 3 CASE_PASS = 0 CASE_FAIL = 1 FIXTURE_PASS = 0 FIXTURE_FAIL = 1 x264-master/tools/digress/errors.py000066400000000000000000000020511502133446700175430ustar00rootroot00000000000000""" Digress errors. """ class DigressError(Exception): """ Digress error base class. """ class NoSuchTestError(DigressError): """ Raised when no such test exists. """ class DisabledTestError(DigressError): """ Test is disabled. """ class SkippedTestError(DigressError): """ Test is marked as skipped. """ class DisabledCaseError(DigressError): """ Case is marked as disabled. """ class SkippedCaseError(DigressError): """ Case is marked as skipped. """ class FailedTestError(DigressError): """ Test failed. """ class ComparisonError(DigressError): """ Comparison failed. """ class IncomparableError(DigressError): """ Values cannot be compared. """ class AlreadyRunError(DigressError): """ Test/case has already been run. """ class SCMError(DigressError): """ Error occurred in SCM. """ def __init__(self, message): self.message = message.replace("\n", " ") def __str__(self): return self.message x264-master/tools/digress/scm/000077500000000000000000000000001502133446700164415ustar00rootroot00000000000000x264-master/tools/digress/scm/__init__.py000066400000000000000000000000551502133446700205520ustar00rootroot00000000000000""" Source control backends for Digress. """ x264-master/tools/digress/scm/dummy.py000066400000000000000000000011541502133446700201470ustar00rootroot00000000000000""" Dummy SCM backend for Digress. """ from random import random def checkout(revision): """ Checkout a revision. """ pass def current_rev(): """ Get the current revision """ return str(random()) def revisions(rev_a, rev_b): """ Get a list of revisions from one to another. """ pass def stash(): """ Stash the repository. """ pass def unstash(): """ Unstash the repository. """ pass def bisect(command, revision): """ Perform a bisection. 
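    Not supported by the dummy backend; always raises NotImplementedError.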
""" raise NotImplementedError("dummy SCM backend does not support bisection") x264-master/tools/digress/scm/git.py000066400000000000000000000052471502133446700176060ustar00rootroot00000000000000""" Git SCM backend for Digress. """ from subprocess import Popen, PIPE, STDOUT import re from digress.errors import SCMError GIT_BRANCH_EXPR = re.compile("[*] (.*)") def checkout(revision): """ Checkout a revision from git. """ proc = Popen([ "git", "checkout", "-f", revision ], stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0].strip() if proc.returncode != 0: raise SCMError("checkout error: %s" % output) def rev_parse(ref): proc = Popen([ "git", "rev-parse", ref ], stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0].strip() if proc.returncode != 0: raise SCMError("rev-parse error: %s" % output) return output def current_rev(): """ Get the current revision. """ return rev_parse("HEAD") def current_branch(): """ Get the current branch. """ proc = Popen([ "git", "branch", "--no-color" ], stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0].strip() if proc.returncode != 0: raise SCMError("branch error: %s" % output) branch_name = GIT_BRANCH_EXPR.findall(output)[0] return branch_name != "(no branch)" and branch_name or None def revisions(rev_a, rev_b): """ Get a list of revisions from one to another. """ proc = Popen([ "git", "log", "--format=%H", ("%s...%s" % (rev_a, rev_b)) ], stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0].strip() if proc.returncode != 0: raise SCMError("log error: %s" % output) return output.split("\n") def stash(): """ Stash the repository. """ proc = Popen([ "git", "stash", "save", "--keep-index" ], stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0].strip() if proc.returncode != 0: raise SCMError("stash error: %s" % output) def unstash(): """ Unstash the repository. """ proc = Popen(["git", "stash", "pop"], stdout=PIPE, stderr=STDOUT) proc.communicate() def bisect(*args): """ Perform a bisection. """ proc = Popen((["git", "bisect"] + list(args)), stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0] if proc.returncode != 0: raise SCMError("bisect error: %s" % output) return output def dirty(): """ Check if the working tree is dirty. """ proc = Popen(["git", "status"], stdout=PIPE, stderr=STDOUT) output = proc.communicate()[0].strip() if proc.returncode != 0: raise SCMError("status error: %s" % output) if "modified:" in output: return True else: return False x264-master/tools/digress/testing.py000066400000000000000000000440121502133446700177070ustar00rootroot00000000000000""" Digress testing core. """ from digress.errors import SkippedTestError, DisabledTestError, NoSuchTestError, \ FailedTestError, AlreadyRunError, SCMError, \ ComparisonError from digress.constants import * from digress.cli import dispatchable import inspect import operator import os import json import textwrap from shutil import rmtree from time import time from functools import wraps from itertools import izip_longest from hashlib import sha1 class depends(object): """ Dependency decorator for a test. """ def __init__(self, *test_names): self.test_names = test_names def __call__(self, func): func.digress_depends = self.test_names return func class _skipped(object): """ Internal skipped decorator. """ def __init__(self, reason=""): self._reason = reason def __call__(self, func): @wraps(func) def _closure(*args): raise SkippedTestError(self._reason) return _closure class disabled(object): """ Disable a test, with reason. 
""" def __init__(self, reason=""): self._reason = reason def __call__(self, func): @wraps(func) def _closure(*args): raise DisabledTestError(self._reason) return _closure class comparer(object): """ Set the comparer for a test. """ def __init__(self, comparer_): self._comparer = comparer_ def __call__(self, func): func.digress_comparer = self._comparer return func class Fixture(object): cases = [] scm = None flush_before = False def _skip_case(self, case, depend): for name, meth in inspect.getmembers(case): if name[:5] == "test_": setattr( case, name, _skipped("failed dependency: case %s" % depend)(meth) ) def _run_case(self, case, results): if case.__name__ in results: raise AlreadyRunError for depend in case.depends: if depend.__name__ in results and results[depend.__name__]["status"] != CASE_PASS: self._skip_case(case, depend.__name__) try: result = self._run_case(depend, results) except AlreadyRunError: continue if result["status"] != CASE_PASS: self._skip_case(case, depend.__name__) result = case().run() results[case.__name__] = result return result @dispatchable def flush(self, revision=None): """ Flush any cached results. Takes a revision for an optional argument. """ if not revision: print "Flushing all cached results...", try: rmtree(".digress_%s" % self.__class__.__name__) except Exception, e: print "failed: %s" % e else: print "done." else: try: rev = self.scm.rev_parse(revision) except SCMError, e: print e else: print "Flushing cached results for %s..." % rev, try: rmtree(os.path.join(".digress_%s" % self.__class__.__name__, rev)) except Exception, e: print "failed: %s" % e else: print "done." @dispatchable def run(self, revision=None): """ Run the fixture for a specified revision. Takes a revision for an argument. """ oldrev = None oldbranch = None dirty = False try: dirty = self.scm.dirty() # if the tree is clean, then we don't need to make an exception if not dirty and revision is None: revision = "HEAD" if revision: oldrev = self.scm.current_rev() oldbranch = self.scm.current_branch() if dirty: self.scm.stash() self.scm.checkout(revision) rev = self.scm.current_rev() self.datastore = os.path.join(".digress_%s" % self.__class__.__name__, rev) if os.path.isdir(self.datastore): if self.flush_before: self.flush(rev) else: os.makedirs(self.datastore) else: rev = "(dirty working tree)" self.datastore = None print "Running fixture %s on revision %s...\n" % (self.__class__.__name__, rev) results = {} for case in self.cases: try: self._run_case(case, results) except AlreadyRunError: continue total_time = reduce(operator.add, filter( None, [ result["time"] for result in results.values() ] ), 0) overall_status = ( CASE_FAIL in [ result["status"] for result in results.values() ] ) and FIXTURE_FAIL or FIXTURE_PASS print "Fixture %s in %.4f.\n" % ( (overall_status == FIXTURE_PASS) and "passed" or "failed", total_time ) return { "cases" : results, "time" : total_time, "status" : overall_status, "revision" : rev } finally: if oldrev: self.scm.checkout(oldrev) if oldbranch: self.scm.checkout(oldbranch) if dirty: self.scm.unstash() @dispatchable def bisect(self, good_rev, bad_rev=None): """ Perform a bisection between two revisions. First argument is the good revision, second is the bad revision, which defaults to the current revision. 
""" if not bad_rev: bad_rev = self.scm.current_rev() dirty = False # get a set of results for the good revision good_result = self.run(good_rev) good_rev = good_result["revision"] try: dirty = self.scm.dirty() if dirty: self.scm.stash() self.scm.bisect("start") self.scm.bisect("bad", bad_rev) self.scm.bisect("good", good_rev) bisecting = True isbad = False while bisecting: results = self.run(self.scm.current_rev()) revision = results["revision"] # perform comparisons # FIXME: this just uses a lot of self.compare for case_name, case_result in good_result["cases"].iteritems(): case = filter(lambda case: case.__name__ == case_name, self.cases)[0] for test_name, test_result in case_result["tests"].iteritems(): test = filter( lambda pair: pair[0] == "test_%s" % test_name, inspect.getmembers(case) )[0][1] other_result = results["cases"][case_name]["tests"][test_name] if other_result["status"] == TEST_FAIL and case_result["status"] != TEST_FAIL: print "Revision %s failed %s.%s." % (revision, case_name, test_name) isbad = True break elif hasattr(test, "digress_comparer"): try: test.digress_comparer(test_result["value"], other_result["value"]) except ComparisonError, e: print "%s differs: %s" % (test_name, e) isbad = True break if isbad: output = self.scm.bisect("bad", revision) print "Marking revision %s as bad." % revision else: output = self.scm.bisect("good", revision) print "Marking revision %s as good." % revision if output.split("\n")[0].endswith("is the first bad commit"): print "\nBisection complete.\n" print output bisecting = False print "" except SCMError, e: print e finally: self.scm.bisect("reset") if dirty: self.scm.unstash() @dispatchable def multicompare(self, rev_a=None, rev_b=None, mode="waterfall"): """ Generate a comparison of tests. Takes three optional arguments, from which revision, to which revision, and the method of display (defaults to vertical "waterfall", also accepts "river" for horizontal display) """ if not rev_a: rev_a = self.scm.current_rev() if not rev_b: rev_b = self.scm.current_rev() revisions = self.scm.revisions(rev_a, rev_b) results = [] for revision in revisions: results.append(self.run(revision)) test_names = reduce(operator.add, [ [ (case_name, test_name) for test_name, test_result in case_result["tests"].iteritems() ] for case_name, case_result in results[0]["cases"].iteritems() ], []) MAXLEN = 20 colfmt = "| %s " table = [] if mode not in ("waterfall", "river"): mode = "waterfall" print "Unknown multicompare mode specified, defaulting to %s." 
% mode if mode == "waterfall": header = [ "Test" ] for result in results: header.append(result["revision"]) table.append(header) for test_name in test_names: row_data = [ ".".join(test_name) ] for result in results: test_result = result["cases"][test_name[0]]["tests"][test_name[1]] if test_result["status"] != TEST_PASS: value = "did not pass: %s" % (test_result["value"]) else: value = "%s (%.4f)" % (test_result["value"], test_result["time"]) row_data.append(value) table.append(row_data) elif mode == "river": header = [ "Revision" ] for test_name in test_names: header.append(".".join(test_name)) table.append(header) for result in results: row_data = [ result["revision"] ] for case_name, case_result in result["cases"].iteritems(): for test_name, test_result in case_result["tests"].iteritems(): if test_result["status"] != TEST_PASS: value = "did not pass: %s" % (test_result["value"]) else: value = "%s (%.4f)" % (test_result["value"], test_result["time"]) row_data.append(value) table.append(row_data) breaker = "=" * (len(colfmt % "".center(MAXLEN)) * len(table[0]) + 1) print breaker for row in table: for row_stuff in izip_longest(*[ textwrap.wrap(col, MAXLEN, break_on_hyphens=False) for col in row ], fillvalue=""): row_output = "" for col in row_stuff: row_output += colfmt % col.ljust(MAXLEN) row_output += "|" print row_output print breaker @dispatchable def compare(self, rev_a, rev_b=None): """ Compare two revisions directly. Takes two arguments, second is optional and implies current revision. """ results_a = self.run(rev_a) results_b = self.run(rev_b) for case_name, case_result in results_a["cases"].iteritems(): case = filter(lambda case: case.__name__ == case_name, self.cases)[0] header = "Comparison of case %s" % case_name print header print "=" * len(header) for test_name, test_result in case_result["tests"].iteritems(): test = filter( lambda pair: pair[0] == "test_%s" % test_name, inspect.getmembers(case) )[0][1] other_result = results_b["cases"][case_name]["tests"][test_name] if test_result["status"] != TEST_PASS or other_result["status"] != TEST_PASS: print "%s cannot be compared as one of the revisions have not passed it." % test_name elif hasattr(test, "digress_comparer"): try: test.digress_comparer(test_result["value"], other_result["value"]) except ComparisonError, e: print "%s differs: %s" % (test_name, e) else: print "%s does not differ." % test_name else: print "%s has no comparer and therefore cannot be compared." % test_name print "" @dispatchable def list(self): """ List all available test cases, excluding dependencies. 
""" print "\nAvailable Test Cases" print "====================" for case in self.cases: print case.__name__ def register_case(self, case): case.fixture = self self.cases.append(case) class Case(object): depends = [] fixture = None def _get_test_by_name(self, test_name): if not hasattr(self, "test_%s" % test_name): raise NoSuchTestError(test_name) return getattr(self, "test_%s" % test_name) def _run_test(self, test, results): test_name = test.__name__[5:] if test_name in results: raise AlreadyRunError if hasattr(test, "digress_depends"): for depend in test.digress_depends: if depend in results and results[depend]["status"] != TEST_PASS: test = _skipped("failed dependency: %s" % depend)(test) dependtest = self._get_test_by_name(depend) try: result = self._run_test(dependtest, results) except AlreadyRunError: continue if result["status"] != TEST_PASS: test = _skipped("failed dependency: %s" % depend)(test) start_time = time() run_time = None print "Running test %s..." % test_name, try: if not self.datastore: # XXX: this smells funny raise IOError with open(os.path.join( self.datastore, "%s.json" % sha1(test_name).hexdigest() ), "r") as f: result = json.load(f) value = str(result["value"]) if result["status"] == TEST_DISABLED: status = "disabled" elif result["status"] == TEST_SKIPPED: status = "skipped" elif result["status"] == TEST_FAIL: status = "failed" elif result["status"] == TEST_PASS: status = "passed" value = "%s (in %.4f)" % ( result["value"] or "(no result)", result["time"] ) else: status = "???" print "%s (cached): %s" % (status, value) except IOError: try: value = test() except DisabledTestError, e: print "disabled: %s" % e status = TEST_DISABLED value = str(e) except SkippedTestError, e: print "skipped: %s" % e status = TEST_SKIPPED value = str(e) except FailedTestError, e: print "failed: %s" % e status = TEST_FAIL value = str(e) except Exception, e: print "failed with exception: %s" % e status = TEST_FAIL value = str(e) else: run_time = time() - start_time print "passed: %s (in %.4f)" % ( value or "(no result)", run_time ) status = TEST_PASS result = { "status" : status, "value" : value, "time" : run_time } if self.datastore: with open(os.path.join( self.datastore, "%s.json" % sha1(test_name).hexdigest() ), "w") as f: json.dump(result, f) results[test_name] = result return result def run(self): print "Running case %s..." 
% self.__class__.__name__ if self.fixture.datastore: self.datastore = os.path.join( self.fixture.datastore, sha1(self.__class__.__name__).hexdigest() ) if not os.path.isdir(self.datastore): os.makedirs(self.datastore) else: self.datastore = None results = {} for name, meth in inspect.getmembers(self): if name[:5] == "test_": try: self._run_test(meth, results) except AlreadyRunError: continue total_time = reduce(operator.add, filter( None, [ result["time"] for result in results.values() ] ), 0) overall_status = ( TEST_FAIL in [ result["status"] for result in results.values() ] ) and CASE_FAIL or CASE_PASS print "Case %s in %.4f.\n" % ( (overall_status == FIXTURE_PASS) and "passed" or "failed", total_time ) return { "tests" : results, "time" : total_time, "status" : overall_status } x264-master/tools/gas-preprocessor.pl000077500000000000000000001300141502133446700200540ustar00rootroot00000000000000#!/usr/bin/env perl # by David Conrad # This code is licensed under GPLv2 or later; go to gnu.org to read it # (not that it much matters for an asm preprocessor) # usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" use strict; # Apple's gas is ancient and doesn't support modern preprocessing features like # .rept and has ugly macro syntax, among other things. Thus, this script # implements the subset of the gas preprocessor used by x264 and ffmpeg # that isn't supported by Apple's gas. my %canonical_arch = ("aarch64" => "aarch64", "arm64" => "aarch64", "arm" => "arm", "powerpc" => "powerpc", "ppc" => "powerpc"); my %comments = ("aarch64" => '//', "arm" => '@', "ppc" => '#', "powerpc" => '#'); my @gcc_cmd; my @preprocess_c_cmd; my $comm; my $arch; my $as_type = "apple-gas"; my $fix_unreq = $^O eq "darwin"; my $force_thumb = 0; my $verbose = 0; my $arm_cond_codes = "eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al|hs|lo"; my $usage_str = " $0\n Gas-preprocessor.pl converts assembler files using modern GNU as syntax for Apple's ancient gas version or clang's incompatible integrated assembler. The conversion is regularly tested for FFmpeg, Libav, x264 and vlc. Other projects might use different features which are not correctly handled. Options for this program needs to be separated with ' -- ' from the assembler command. Following options are currently supported: -help - this usage text -arch - target architecture -as-type - one value out of {{,apple-}{gas,clang},armasm} -fix-unreq -no-fix-unreq -force-thumb - assemble as thumb regardless of the input source (note, this is incomplete and only works for sources it explicitly was tested with) -verbose - print executed commands "; sub usage() { print $usage_str; } while (@ARGV) { my $opt = shift; if ($opt =~ /^-(no-)?fix-unreq$/) { $fix_unreq = $1 ne "no-"; } elsif ($opt eq "-force-thumb") { $force_thumb = 1; } elsif ($opt eq "-verbose") { $verbose = 1; } elsif ($opt eq "-arch") { $arch = shift; die "unknown arch: '$arch'\n" if not exists $canonical_arch{$arch}; } elsif ($opt eq "-as-type") { $as_type = shift; die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang|llvm_gcc)|armasm)$/; } elsif ($opt eq "-help") { usage(); exit 0; } elsif ($opt eq "--" ) { @gcc_cmd = @ARGV; } elsif ($opt =~ /^-/) { die "option '$opt' is not known. See '$0 -help' for usage information\n"; } else { push @gcc_cmd, $opt, @ARGV; } last if (@gcc_cmd); } if (grep /\.c$/, @gcc_cmd) { # C file (inline asm?) 
- compile @preprocess_c_cmd = (@gcc_cmd, "-S"); } elsif (grep /\.[sS]$/, @gcc_cmd) { # asm file, just do C preprocessor @preprocess_c_cmd = (@gcc_cmd, "-E"); } elsif (grep /-(v|h|-version|dumpversion)/, @gcc_cmd) { # pass -v/--version along, used during probing. Matching '-v' might have # uninteded results but it doesn't matter much if gas-preprocessor or # the compiler fails. print STDERR join(" ", @gcc_cmd)."\n" if $verbose; exec(@gcc_cmd); } else { die "Unrecognized input filetype"; } if ($as_type eq "armasm") { $preprocess_c_cmd[0] = "cpp"; # Remove -ignore XX parameter pairs from preprocess_c_cmd my $index = 1; while ($index < $#preprocess_c_cmd) { if ($preprocess_c_cmd[$index] eq "-ignore" and $index + 1 < $#preprocess_c_cmd) { splice(@preprocess_c_cmd, $index, 2); next; } $index++; } if (grep /^-MM$/, @preprocess_c_cmd) { push(@preprocess_c_cmd, "-D_WIN32"); # Normally a preprocessor for windows would predefine _WIN32, # but we're using any generic system-agnostic preprocessor "cpp" # with -undef (to avoid getting predefined variables from the host # system in cross compilation cases), so manually define it here. # We only use this generic preprocessor for generating dependencies, # if the build system runs preprocessing with -M/-MM without -MF. push(@preprocess_c_cmd, "-undef"); @preprocess_c_cmd = grep ! /^-nologo$/, @preprocess_c_cmd; print STDERR join(" ", @preprocess_c_cmd)."\n" if $verbose; system(@preprocess_c_cmd) == 0 or die "Error running preprocessor"; exit 0; } # If not preprocessing for getting a dependency list, use cl.exe # instead. $preprocess_c_cmd[0] = "cl.exe"; } # if compiling, avoid creating an output file named '-.o' if ((grep /^-c$/, @gcc_cmd) && !(grep /^-o/, @gcc_cmd)) { foreach my $i (@gcc_cmd) { if ($i =~ /\.[csS]$/) { my $outputfile = $i; $outputfile =~ s/\.[csS]$/.o/; push(@gcc_cmd, "-o"); push(@gcc_cmd, $outputfile); last; } } } # Remove the -o argument; if omitted, we by default preprocess to stdout. my $index = 1; while ($index < $#preprocess_c_cmd) { if ($preprocess_c_cmd[$index] eq "-o") { splice(@preprocess_c_cmd, $index, 2); last; } $index++; } @preprocess_c_cmd = grep ! /^-c$/, @preprocess_c_cmd; my $tempfile; if ($as_type ne "armasm") { @gcc_cmd = map { /\.[csS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; # Filter out options that can cause warnings due to unused arguments, # Clang warns about unused -D parameters when invoked with "-x assembler". @gcc_cmd = grep ! /^-D/, @gcc_cmd; } else { @preprocess_c_cmd = grep ! /^-m/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-G/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-W/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-Z/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-fp/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-EHsc$/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-O/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-oldit/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-FS/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-w/, @preprocess_c_cmd; @preprocess_c_cmd = grep ! /^-M/, @preprocess_c_cmd; @gcc_cmd = grep ! /^-G/, @gcc_cmd; @gcc_cmd = grep ! /^-W/, @gcc_cmd; @gcc_cmd = grep ! /^-Z/, @gcc_cmd; @gcc_cmd = grep ! /^-fp/, @gcc_cmd; @gcc_cmd = grep ! /^-EHsc$/, @gcc_cmd; @gcc_cmd = grep ! /^-O/, @gcc_cmd; @gcc_cmd = grep ! /^-FS/, @gcc_cmd; @gcc_cmd = grep ! 
/^-w/, @gcc_cmd; my @outfiles = grep /\.(o|obj)$/, @gcc_cmd; $tempfile = $outfiles[0].".asm"; # Remove most parameters from gcc_cmd, which actually is the armasm command, # which doesn't support any of the common compiler/preprocessor options. @gcc_cmd = grep ! /^-D/, @gcc_cmd; @gcc_cmd = grep ! /^-U/, @gcc_cmd; @gcc_cmd = grep ! /^-m/, @gcc_cmd; @gcc_cmd = grep ! /^-M/, @gcc_cmd; @gcc_cmd = grep ! /^-c$/, @gcc_cmd; @gcc_cmd = grep ! /^-I/, @gcc_cmd; @gcc_cmd = map { /\.S$/ ? $tempfile : $_ } @gcc_cmd; } # detect architecture from gcc binary name if (!$arch) { if ($gcc_cmd[0] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { $arch = $1; } else { # look for -arch flag foreach my $i (1 .. $#gcc_cmd-1) { if ($gcc_cmd[$i] eq "-arch" and $gcc_cmd[$i+1] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { $arch = $1; } } } } # assume we're not cross-compiling if no -arch or the binary doesn't have the arch name $arch = qx/arch/ if (!$arch); # remove any whitespace, e.g. arch command might print a newline $arch =~ s/\s+//g; die "Unknown target architecture '$arch'" if not exists $canonical_arch{$arch}; $arch = $canonical_arch{$arch}; $comm = $comments{$arch}; my $inputcomm = $comm; $comm = ";" if $as_type =~ /armasm/; my %ppc_spr = (ctr => 9, vrsave => 256); print STDERR join(" ", @preprocess_c_cmd)."\n" if $verbose; open(INPUT, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; if ($ENV{GASPP_DEBUG}) { open(ASMFILE, ">&STDOUT"); } else { if ($as_type ne "armasm") { print STDERR join(" ", @gcc_cmd)."\n" if $verbose; open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; } else { open(ASMFILE, ">", $tempfile); } } my $current_macro = ''; my $macro_level = 0; my $rept_level = 0; my %macro_lines; my %macro_args; my %macro_args_default; my $macro_count = 0; my $altmacro = 0; my $in_irp = 0; my $num_repts; my @rept_lines; my @irp_args; my $irp_param; my @ifstack; my %symbols; my @sections; my %literal_labels; # for ldr , = my $literal_num = 0; my $literal_expr = ".word"; $literal_expr = ".quad" if $arch eq "aarch64"; my $thumb = 0; my %thumb_labels; my %call_targets; my %import_symbols; my %neon_alias_reg; my %neon_alias_type; my $temp_label_next = 0; my %last_temp_labels; my %next_temp_labels; my %labels_seen; my %aarch64_req_alias; if ($force_thumb) { parse_line(".thumb\n"); } if ($as_type eq "armasm") { parse_line(".text\n"); } # pass 1: parse .macro # note that the handling of arguments is probably overly permissive vs. gas # but it should be the same for valid cases while () { # remove lines starting with '#', preprocessing is done, '#' at start of # the line indicates a comment for all supported archs (aarch64, arm, ppc # and x86). Also strips line number comments but since they are off anyway # it is no loss. s/^\s*#.*$//; # remove all comments (to avoid interfering with evaluating directives) s/(? 0) { $ifstack[-1] = -$ifstack[-1]; } return 1; } elsif ($line =~ /\.else/) { $ifstack[-1] = !$ifstack[-1]; return 1; } elsif (handle_if($line)) { return 1; } } # discard lines in false .if blocks foreach my $i (0 .. $#ifstack) { if ($ifstack[$i] <= 0) { return 1; } } } return 0; } my $last_line = ""; sub parse_line_continued { my $line = $_[0]; $last_line .= $line; if ($last_line =~ /\\$/) { $last_line =~ s/\\$//; } else { # Add newlines at the end of lines after concatenation. 
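# At this point $last_line no longer ends in a backslash, so it forms one complete logical line that can be handed to parse_line().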
$last_line .= "\n"; parse_line($last_line); $last_line = ""; } } sub parse_line { my $line = $_[0]; return if (parse_if_line($line)); if (scalar(@rept_lines) == 0) { if ($line =~ /\.macro/) { $macro_level++; if ($macro_level > 1 && !$current_macro) { die "nested macros but we don't have master macro"; } } elsif ($line =~ /\.endm/) { $macro_level--; if ($macro_level < 0) { die "unmatched .endm"; } elsif ($macro_level == 0) { $current_macro = ''; return; } } } if ($macro_level == 0) { if ($line =~ /\.(rept|irp)/) { $rept_level++; } elsif ($line =~ /.endr/) { $rept_level--; } } if ($macro_level > 1) { push(@{$macro_lines{$current_macro}}, $line); } elsif (scalar(@rept_lines) and $rept_level >= 1) { push(@rept_lines, $line); } elsif ($macro_level == 0) { expand_macros($line); } else { if ($line =~ /\.macro\s+([\d\w\.]+)\s*,?\s*(.*)/) { $current_macro = $1; # commas in the argument list are optional, so only use whitespace as the separator my $arglist = $2; $arglist =~ s/,/ /g; my @args = split(/\s+/, $arglist); foreach my $i (0 .. $#args) { my @argpair = split(/=/, $args[$i]); $macro_args{$current_macro}[$i] = $argpair[0]; $argpair[0] =~ s/:vararg$//; $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; } # ensure %macro_lines has the macro name added as a key $macro_lines{$current_macro} = []; } elsif ($current_macro) { push(@{$macro_lines{$current_macro}}, $line); } else { die "macro level without a macro name"; } } } sub handle_set { my $line = $_[0]; if ($line =~ /\.(?:set|equ)\s+(\S*)\s*,\s*(.*)/) { $symbols{$1} = eval_expr($2); return 1; } return 0; } sub expand_macros { my $line = $_[0]; # handle .if directives; apple's assembler doesn't support important non-basic ones # evaluating them is also needed to handle recursive macros if (handle_if($line)) { return; } if (/\.purgem\s+([\d\w\.]+)/) { delete $macro_lines{$1}; delete $macro_args{$1}; delete $macro_args_default{$1}; return; } if ($line =~ /\.altmacro/) { $altmacro = 1; return; } if ($line =~ /\.noaltmacro/) { $altmacro = 0; return; } $line =~ s/\%([^,]*)/eval_expr($1)/eg if $altmacro; # Strip out the .set lines from the armasm output return if (handle_set($line) and $as_type eq "armasm"); if ($line =~ /\.rept\s+(.*)/) { $num_repts = $1; @rept_lines = ("\n"); # handle the possibility of repeating another directive on the same line # .endr on the same line is not valid, I don't know if a non-directive is if ($num_repts =~ s/(\.\w+.*)//) { push(@rept_lines, "$1\n"); } $num_repts = eval_expr($num_repts); } elsif ($line =~ /\.irp\s+([\d\w\.]+)\s*(.*)/) { $in_irp = 1; $num_repts = 1; @rept_lines = ("\n"); $irp_param = $1; # only use whitespace as the separator my $irp_arglist = $2; $irp_arglist =~ s/,/ /g; $irp_arglist =~ s/^\s+//; @irp_args = split(/\s+/, $irp_arglist); } elsif ($line =~ /\.irpc\s+([\d\w\.]+)\s*(.*)/) { $in_irp = 1; $num_repts = 1; @rept_lines = ("\n"); $irp_param = $1; my $irp_arglist = $2; $irp_arglist =~ s/,/ /g; $irp_arglist =~ s/^\s+//; @irp_args = split(//, $irp_arglist); } elsif ($line =~ /\.endr/) { my @prev_rept_lines = @rept_lines; my $prev_in_irp = $in_irp; my @prev_irp_args = @irp_args; my $prev_irp_param = $irp_param; my $prev_num_repts = $num_repts; @rept_lines = (); $in_irp = 0; @irp_args = ''; if ($prev_in_irp != 0) { foreach my $i (@prev_irp_args) { foreach my $origline (@prev_rept_lines) { my $line = $origline; $line =~ s/\\$prev_irp_param/$i/g; $line =~ s/\\\(\)//g; # remove \() parse_line($line); } } } else { for (1 .. 
$prev_num_repts) { foreach my $origline (@prev_rept_lines) { my $line = $origline; parse_line($line); } } } } elsif ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) { handle_serialized_line($1); my $macro = $2; # commas are optional here too, but are syntactically important because # parameters can be blank my @arglist = split(/,/, $3); my @args; my @args_seperator; my $comma_sep_required = 0; foreach (@arglist) { # allow arithmetic/shift operators in macro arguments $_ =~ s/\s*(\+|-|\*|\/|<<|>>|<|>)\s*/$1/g; my @whitespace_split = split(/\s+/, $_); if (!@whitespace_split) { push(@args, ''); push(@args_seperator, ''); } else { foreach (@whitespace_split) { #print ("arglist = \"$_\"\n"); if (length($_)) { push(@args, $_); my $sep = $comma_sep_required ? "," : " "; push(@args_seperator, $sep); #print ("sep = \"$sep\", arg = \"$_\"\n"); $comma_sep_required = 0; } } } $comma_sep_required = 1; } my %replacements; if ($macro_args_default{$macro}){ %replacements = %{$macro_args_default{$macro}}; } # construct hashtable of text to replace foreach my $i (0 .. $#args) { my $argname = $macro_args{$macro}[$i]; my @macro_args = @{ $macro_args{$macro} }; if ($args[$i] =~ m/=/) { # arg=val references the argument name # XXX: I'm not sure what the expected behaviour if a lot of # these are mixed with unnamed args my @named_arg = split(/=/, $args[$i]); $replacements{$named_arg[0]} = $named_arg[1]; } elsif ($i > $#{$macro_args{$macro}}) { # more args given than the macro has named args # XXX: is vararg allowed on arguments before the last? $argname = $macro_args{$macro}[-1]; if ($argname =~ s/:vararg$//) { #print "macro = $macro, args[$i] = $args[$i], args_seperator=@args_seperator, argname = $argname, arglist[$i] = $arglist[$i], arglist = @arglist, args=@args, macro_args=@macro_args\n"; #$replacements{$argname} .= ", $args[$i]"; $replacements{$argname} .= "$args_seperator[$i] $args[$i]"; } else { die "Too many arguments to macro $macro"; } } else { $argname =~ s/:vararg$//; $replacements{$argname} = $args[$i]; } } my $count = $macro_count++; # apply replacements as regex foreach (@{$macro_lines{$macro}}) { my $macro_line = $_; # do replacements by longest first, this avoids wrong replacement # when argument names are subsets of each other foreach (reverse sort {length $a <=> length $b} keys %replacements) { $macro_line =~ s/\\$_/$replacements{$_}/g; } if ($altmacro) { foreach (reverse sort {length $a <=> length $b} keys %replacements) { $macro_line =~ s/\b$_\b/$replacements{$_}/g; } } $macro_line =~ s/\\\@/$count/g; $macro_line =~ s/\\\(\)//g; # remove \() parse_line($macro_line); } } else { handle_serialized_line($line); } } sub is_arm_register { my $name = $_[0]; if ($name eq "lr" or $name eq "ip" or $name =~ /^[rav]\d+$/) { return 1; } return 0; } sub is_aarch64_register { my $name = $_[0]; if ($name =~ /^[xw]\d+$/) { return 1; } return 0; } sub handle_local_label { my $line = $_[0]; my $num = $_[1]; my $dir = $_[2]; my $target = "$num$dir"; if ($dir eq "b") { $line =~ s/\b$target\b/$last_temp_labels{$num}/g; } else { my $name = "temp_label_$temp_label_next"; $temp_label_next++; push(@{$next_temp_labels{$num}}, $name); $line =~ s/\b$target\b/$name/g; } return $line; } sub handle_serialized_line { my $line = $_[0]; # handle .previous (only with regard to .section not .subsection) if ($line =~ /\.(section|text|const_data)/) { push(@sections, $line); } elsif ($line =~ /\.previous/) { if (!$sections[-2]) { die ".previous without a previous section"; } $line = $sections[-2]; 
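# Re-pushing the restored section below means a further .previous toggles between the two most recent sections, matching gas semantics.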
push(@sections, $line); } $thumb = 1 if $line =~ /\.code\s+16|\.thumb/; $thumb = 0 if $line =~ /\.code\s+32|\.arm/; # handle ldr , = if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/ and $as_type ne "armasm") { my $label = $literal_labels{$3}; if (!$label) { $label = "Literal_$literal_num"; $literal_num++; $literal_labels{$3} = $label; } $line = "$1 ldr$2, $label\n"; } elsif ($line =~ /\.ltorg/ and $as_type ne "armasm") { $line .= ".align 2\n"; foreach my $literal (keys %literal_labels) { $line .= "$literal_labels{$literal}:\n $literal_expr $literal\n"; } %literal_labels = (); } # handle GNU as pc-relative relocations for adrp/add if ($line =~ /(.*)\s*adrp([\w\s\d]+)\s*,\s*#?:pg_hi21:([^\s]+)/ and $as_type =~ /^apple-/) { $line = "$1 adrp$2, ${3}\@PAGE\n"; } elsif ($line =~ /(.*)\s*add([\w\s\d]+)\s*,([\w\s\d]+)\s*,\s*#?:lo12:([^\s]+)/ and $as_type =~ /^apple-/) { $line = "$1 add$2, $3, ${4}\@PAGEOFF\n"; } # thumb add with large immediate needs explicit add.w if ($thumb and $line =~ /add\s+.*#([^@]+)/) { $line =~ s/add/add.w/ if eval_expr($1) > 255; } # mach-o local symbol names start with L (no dot) # armasm also can't handle labels that start with a dot. if ($as_type =~ /apple-/ or $as_type eq "armasm") { $line =~ s/(? with ic as conditional code if ($cond =~ /^(|$arm_cond_codes)$/) { if (exists $thumb_labels{$label}) { print ASMFILE ".thumb_func $label\n"; } else { $call_targets{$label}++; } } } # @l -> lo16() @ha -> ha16() $line =~ s/,\s+([^,]+)\@l\b/, lo16($1)/g; $line =~ s/,\s+([^,]+)\@ha\b/, ha16($1)/g; # move to/from SPR if ($line =~ /(\s+)(m[ft])([a-z]+)\s+(\w+)/ and exists $ppc_spr{$3}) { if ($2 eq 'mt') { $line = "$1${2}spr $ppc_spr{$3}, $4\n"; } else { $line = "$1${2}spr $4, $ppc_spr{$3}\n"; } } if ($line =~ /\.unreq\s+(.*)/) { if (defined $neon_alias_reg{$1}) { delete $neon_alias_reg{$1}; delete $neon_alias_type{$1}; return; } elsif (defined $aarch64_req_alias{$1}) { delete $aarch64_req_alias{$1}; return; } } # old gas versions store upper and lower case names on .req, # but they remove only one on .unreq if ($fix_unreq) { if ($line =~ /\.unreq\s+(.*)/) { $line = ".unreq " . lc($1) . "\n"; $line .= ".unreq " . uc($1) . "\n"; } } if ($line =~ /(\w+)\s+\.(dn|qn)\s+(\w+)(?:\.(\w+))?(\[\d+\])?/) { $neon_alias_reg{$1} = "$3$5"; $neon_alias_type{$1} = $4; return; } if (scalar keys %neon_alias_reg > 0 && $line =~ /^\s+v\w+/) { # This line seems to possibly have a neon instruction foreach (keys %neon_alias_reg) { my $alias = $_; # Require the register alias to match as an individual word, not as a substring # of a larger word-token. if ($line =~ /\b$alias\b/) { $line =~ s/\b$alias\b/$neon_alias_reg{$alias}/g; # Add the type suffix. If multiple aliases match on the same line, # only do this replacement the first time (a vfoo.bar string won't match v\w+). 
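# e.g. after "foo .dn d0.8b", an indented "vmov foo, foo" first becomes "vmov d0, d0" above and then gains the type suffix ("vmov.8b d0, d0") here.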
$line =~ s/^(\s+)(v\w+)(\s+)/$1$2.$neon_alias_type{$alias}$3/; } } } if ($arch eq "aarch64" or $as_type eq "armasm") { # clang's integrated aarch64 assembler in Xcode 5 does not support .req/.unreq if ($line =~ /\b(\w+)\s+\.req\s+(\w+)\b/) { $aarch64_req_alias{$1} = $2; return; } foreach (keys %aarch64_req_alias) { my $alias = $_; # recursively resolve aliases my $resolved = $aarch64_req_alias{$alias}; while (defined $aarch64_req_alias{$resolved}) { $resolved = $aarch64_req_alias{$resolved}; } $line =~ s/\b$alias\b/$resolved/g; } } if ($arch eq "aarch64") { # fix missing aarch64 instructions in Xcode 5.1 (beta3) # mov with vector arguments is not supported, use alias orr instead if ($line =~ /^(\d+:)?\s*mov\s+(v\d[\.{}\[\]\w]+),\s*(v\d[\.{}\[\]\w]+)\b\s*$/) { $line = "$1 orr $2, $3, $3\n"; } # movi 16, 32 bit shifted variant, shift is optional if ($line =~ /^(\d+:)?\s*movi\s+(v[0-3]?\d\.(?:2|4|8)[hsHS])\s*,\s*(#\w+)\b\s*$/) { $line = "$1 movi $2, $3, lsl #0\n"; } # Xcode 5 misses the alias uxtl. Replace it with the more general ushll. # Clang 3.4 misses the alias sxtl too. Replace it with the more general sshll. # armasm64 also misses these instructions. if ($line =~ /^(\d+:)?\s*(s|u)xtl(2)?\s+(v[0-3]?\d\.[248][hsdHSD])\s*,\s*(v[0-3]?\d\.(?:2|4|8|16)[bhsBHS])\b\s*$/) { $line = "$1 $2shll$3 $4, $5, #0\n"; } # clang 3.4 and armasm64 do not automatically use shifted immediates in add/sub if (($as_type eq "clang" or $as_type eq "armasm") and $line =~ /^(\d+:)?(\s*(?:add|sub)s?) ([^#l]+)#([\d\+\-\*\/ <>]+)\s*$/) { my $imm = eval $4; if ($imm > 4095 and not ($imm & 4095)) { $line = "$1 $2 $3#" . ($imm >> 12) . ", lsl #12\n"; } } if ($ENV{GASPP_FIX_XCODE5}) { if ($line =~ /^\s*bsl\b/) { $line =~ s/\b(bsl)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/; $line =~ s/\b(v[0-3]?\d)\.$3\b/$1/g; } if ($line =~ /^\s*saddl2?\b/) { $line =~ s/\b(saddl2?)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/; $line =~ s/\b(v[0-3]?\d)\.\w+\b/$1/g; } if ($line =~ /^\s*dup\b.*\]$/) { $line =~ s/\bdup(\s+v[0-3]?\d)\.(\w+)\b/dup.$2$1/g; $line =~ s/\b(v[0-3]?\d)\.[bhsdBHSD](\[\d\])$/$1$2/g; } } } if ($as_type eq "armasm") { # Also replace variables set by .set foreach (keys %symbols) { my $sym = $_; $line =~ s/\b$sym\b/$symbols{$sym}/g; } # Handle function declarations and keep track of the declared labels if ($line =~ s/^\s*\.func\s+(\w+)/$1 PROC/) { $labels_seen{$1} = 1; } if ($line =~ s/^\s*(\d+)://) { # Convert local labels into unique labels. armasm (at least in # RVCT) has something similar, but still different enough. # By converting to unique labels we avoid any possible # incompatibilities. my $num = $1; foreach (@{$next_temp_labels{$num}}) { $line = "$_\n" . $line; } @next_temp_labels{$num} = (); my $name = "temp_label_$temp_label_next"; $temp_label_next++; # The matching regexp above removes the label from the start of # the line (which might contain an instruction as well), re-add # it on a separate line above it. $line = "$name:\n" . $line; $last_temp_labels{$num} = $name; } if ($line =~ s/^\s*(\w+):/$1/) { # Skip labels that have already been declared with a PROC, # labels must not be declared multiple times. return if (defined $labels_seen{$1}); $labels_seen{$1} = 1; } elsif ($line !~ /(\w+) PROC/) { # If not a label, make sure the line starts with whitespace, # otherwise ms armasm interprets it incorrectly. $line =~ s/^[\.\w]/\t$&/; } # Check branch instructions if ($line =~ /(?:^|\n)\s*(\w+\s*:\s*)?(bl?x?\.?([^\s]{2})?(\.w)?)\s+(\w+)/) { my $instr = $2; my $cond = $3; my $width = $4; my $target = $5; # Don't interpret e.g. 
bic as b with ic as conditional code if ($cond !~ /^(|$arm_cond_codes)$/) { # Not actually a branch } elsif ($target =~ /^(\d+)([bf])$/) { # The target is a local label $line = handle_local_label($line, $1, $2); $line =~ s/\b$instr\b/$&.w/ if $width eq "" and $arch eq "arm"; } elsif (($arch eq "arm" and !is_arm_register($target)) or ($arch eq "aarch64" and !is_aarch64_register($target))) { $call_targets{$target}++; } } elsif ($line =~ /(?:^|\n)\s*(\w+\s*:\s*)?(cbn?z|adr|tbn?z)\s+(\w+)\s*,(\s*#\d+\s*,)?\s*(\w+)/) { my $instr = $2; my $reg = $3; my $bit = $4; my $target = $5; if ($target =~ /^(\d+)([bf])$/) { # The target is a local label $line = handle_local_label($line, $1, $2); } else { $call_targets{$target}++; } # Convert tbz with a wX register into an xX register, # due to armasm64 bugs/limitations. if (($instr eq "tbz" or $instr eq "tbnz") and $reg =~ /w\d+/) { my $xreg = $reg; $xreg =~ s/w/x/; $line =~ s/\b$reg\b/$xreg/; } } elsif ($line =~ /^\s*.([hxd]?word|quad).*\b\d+[bf]\b/) { while ($line =~ /\b(\d+)([bf])\b/g) { $line = handle_local_label($line, $1, $2); } } # ALIGN in armasm syntax is the actual number of bytes if ($line =~ /\.(?:p2)?align\s+(\d+)/) { my $align = 1 << $1; $line =~ s/\.(?:p2)?align\s+(\d+)/ALIGN $align/; } # Convert gas style [r0, :128] into armasm [r0@128] alignment specification $line =~ s/\[([^\[,]+),?\s*:(\d+)\]/[$1\@$2]/g; # armasm treats logical values {TRUE} and {FALSE} separately from # numeric values - logical operators and values can't be intermixed # with numerical values. Evaluate ! and (a <> b) into numbers, # let the assembler evaluate the rest of the expressions. This current # only works for cases when ! and <> are used with actual constant numbers, # we don't evaluate subexpressions here. # Evaluate ! while ($line =~ /!\s*(\d+)/g) { my $val = ($1 != 0) ? 0 : 1; $line =~ s/!(\d+)/$val/; } # Evaluate (a > b) while ($line =~ /\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/) { my $val; if ($2 eq "<") { $val = ($1 < $3) ? 1 : 0; } else { $val = ($1 > $3) ? 1 : 0; } $line =~ s/\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/$val/; } if ($arch eq "arm") { # Change a movw... #:lower16: into a mov32 pseudoinstruction $line =~ s/^(\s*)movw(\s+\w+\s*,\s*)\#:lower16:(.*)$/$1mov32$2$3/; # and remove the following, matching movt completely $line =~ s/^\s*movt\s+\w+\s*,\s*\#:upper16:.*$//; if ($line =~ /^\s*mov32\s+\w+,\s*([a-zA-Z]\w*)/) { $import_symbols{$1}++; } # Misc bugs/deficiencies: # armasm seems unable to parse e.g. "vmov s0, s1" without a type # qualifier, thus add .f32. $line =~ s/^(\s+(?:vmov|vadd))(\s+s\d+\s*,\s*s\d+)/$1.f32$2/; } elsif ($arch eq "aarch64") { # Convert ext into ext8; armasm64 seems to require it named as ext8. $line =~ s/^(\s+)ext(\s+)/$1ext8$2/; # Pick up targets from ldr x0, =sym+offset if ($line =~ /^\s*ldr\s+(\w+)\s*,\s*=([a-zA-Z]\w*)(.*)$/) { my $reg = $1; my $sym = $2; my $offset = eval_expr($3); if ($offset < 0 and $ENV{GASPP_ARMASM64_SKIP_NEG_OFFSET}) { # armasm64 in VS < 15.6 is buggy with ldr x0, =sym+offset where the # offset is a negative value; it does write a negative # offset into the literal pool as it should, but the # negative offset only covers the lower 32 bit of the 64 # bit literal/relocation. # Thus remove the offset and apply it manually with a sub # afterwards. $offset = -$offset; $line = "\tldr $reg, =$sym\n\tsub $reg, $reg, #$offset\n"; } $import_symbols{$sym}++; } # armasm64 (currently) doesn't support offsets on adrp targets, # even though the COFF format relocations (and the linker) # supports it. 
Therefore strip out the offsets from adrp and # add :lo12: (in case future armasm64 would start handling it) # and add an extra explicit add instruction for the offset. if ($line =~ s/(adrp\s+\w+\s*,\s*(\w+))([\d\+\-\*\/\(\) <>]+)?/\1/) { $import_symbols{$2}++; } if ($line =~ s/(add\s+(\w+)\s*,\s*\w+\s*,\s*):lo12:(\w+)([\d\+\-\*\/\(\) <>]+)?/\1\3/) { my $reg = $2; my $sym = $3; my $offset = eval_expr($4); $line .= "\tadd $reg, $reg, #$offset\n" if $offset > 0; $import_symbols{$sym}++; } # Convert e.g. "add x0, x0, w0, uxtw" into "add x0, x0, w0, uxtw #0", # or "ldr x0, [x0, w0, uxtw]" into "ldr x0, [x0, w0, uxtw #0]". $line =~ s/(uxt[whb]|sxt[whb])(\s*\]?\s*)$/\1 #0\2/i; # Convert "mov x0, v0.d[0]" into "umov x0, v0.d[0]" $line =~ s/\bmov\s+[xw]\d+\s*,\s*v\d+\.[ds]/u$&/i; # Convert "ccmp w0, #0, #0, ne" into "ccmpne w0, #0, #0", # and "csel w0, w0, w0, ne" into "cselne w0, w0, w0". $line =~ s/(ccmp|csel)\s+([xw]\w+)\s*,\s*([xw#]\w+)\s*,\s*([xw#]\w+)\s*,\s*($arm_cond_codes)/\1\5 \2, \3, \4/; # Convert "cinc w0, w0, ne" into "cincne w0, w0". $line =~ s/(cinc)\s+([xw]\w+)\s*,\s*([xw]\w+)\s*,\s*($arm_cond_codes)/\1\4 \2, \3/; # Convert "cset w0, lo" into "csetlo w0" $line =~ s/(cset)\s+([xw]\w+)\s*,\s*($arm_cond_codes)/\1\3 \2/; if ($ENV{GASPP_ARMASM64_SKIP_PRFUM}) { # Strip out prfum; armasm64 (VS < 15.5) fails to assemble any # variant/combination of prfum tested so far, but since it is # a prefetch instruction it can be skipped without changing # results. $line =~ s/prfum.*\]//; } # Convert "ldrb w0, [x0, #-1]" into "ldurb w0, [x0, #-1]". # Don't do this for forms with writeback though. if ($line =~ /(ld|st)(r[bh]?)\s+(\w+)\s*,\s*\[\s*(\w+)\s*,\s*#([^\]]+)\s*\][^!]/) { my $instr = $1; my $suffix = $2; my $target = $3; my $base = $4; my $offset = eval_expr($5); if ($offset < 0) { $line =~ s/$instr$suffix/${instr}u$suffix/; } } if ($ENV{GASPP_ARMASM64_INVERT_SCALE}) { # Instructions like fcvtzs and scvtf store the scale value # inverted in the opcode (stored as 64 - scale), but armasm64 # in VS < 15.5 stores it as-is. Thus convert from # "fcvtzs w0, s0, #8" into "fcvtzs w0, s0, #56". if ($line =~ /(?:fcvtzs|scvtf)\s+(\w+)\s*,\s*(\w+)\s*,\s*#(\d+)/) { my $scale = $3; my $inverted_scale = 64 - $3; $line =~ s/#$scale/#$inverted_scale/; } } # Convert "ld1 {v0.4h-v3.4h}" into "ld1 {v0.4h,v1.4h,v2.4h,v3.4h}" if ($line =~ /(\{\s*v(\d+)\.(\d+[bhsdBHSD])\s*-\s*v(\d+)\.(\d+[bhsdBHSD])\s*\})/) { my $regspec = $1; my $reg1 = $2; my $layout1 = $3; my $reg2 = $4; my $layout2 = $5; if ($layout1 eq $layout2) { my $new_regspec = "{"; foreach my $i ($reg1 .. $reg2) { $new_regspec .= "," if ($i > $reg1); $new_regspec .= "v$i.$layout1"; } $new_regspec .= "}"; $line =~ s/$regspec/$new_regspec/; } } } # armasm is unable to parse &0x - add spacing $line =~ s/&0x/& 0x/g; } if ($force_thumb) { # Convert register post indexing to a separate add instruction. # This converts e.g. "ldr r0, [r1], r2" into "ldr r0, [r1]", # "add r1, r1, r2". $line =~ s/((?:ldr|str)[bh]?)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g; # Convert "mov pc, lr" into "bx lr", since the former only works # for switching from arm to thumb (and only in armv7), but not # from thumb to arm. $line =~ s/mov\s*pc\s*,\s*lr/bx lr/g; # Convert stmdb/ldmia/stmfd/ldmfd/ldm with only one register into a plain str/ldr with post-increment/decrement. # Wide thumb2 encoding requires at least two registers in register list while all other encodings support one register too. 
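# e.g. "stmdb sp!, {r4}" becomes "str r4, [sp, #-4]!" and "ldmfd sp!, {r4}" becomes "ldr r4, [sp], #4".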
$line =~ s/stm(?:db|fd)\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g; $line =~ s/ldm(?:ia|fd)?\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g; # Convert muls into mul+cmp $line =~ s/muls\s+(\w+),\s*(\w+)\,\s*(\w+)/mul $1, $2, $3\n\tcmp $1, #0/g; # Convert "and r0, sp, #xx" into "mov r0, sp", "and r0, r0, #xx" $line =~ s/and\s+(\w+),\s*(sp|r13)\,\s*#(\w+)/mov $1, $2\n\tand $1, $1, #$3/g; # Convert "ldr r0, [r0, r1, lsl #6]" where the shift is >3 (which # can't be handled in thumb) into "add r0, r0, r1, lsl #6", # "ldr r0, [r0]", for the special case where the same address is # used as base and target for the ldr. if ($line =~ /(ldr[bh]?)\s+(\w+),\s*\[\2,\s*(\w+),\s*lsl\s*#(\w+)\]/ and $4 > 3) { $line =~ s/(ldr[bh]?)\s+(\w+),\s*\[\2,\s*(\w+),\s*lsl\s*#(\w+)\]/add $2, $2, $3, lsl #$4\n\t$1 $2, [$2]/; } $line =~ s/\.arm/.thumb/x; } # comment out unsupported directives $line =~ s/\.type/$comm$&/x if $as_type =~ /^(apple-|armasm)/; $line =~ s/\.func/$comm$&/x if $as_type =~ /^(apple-|clang)/; $line =~ s/\.endfunc/$comm$&/x if $as_type =~ /^(apple-|clang)/; $line =~ s/\.endfunc/ENDP/x if $as_type =~ /armasm/; $line =~ s/\.ltorg/$comm$&/x if $as_type =~ /^(apple-|clang)/; $line =~ s/\.ltorg/LTORG/x if $as_type eq "armasm"; $line =~ s/\.size/$comm$&/x if $as_type =~ /^(apple-|armasm)/; $line =~ s/\.fpu/$comm$&/x if $as_type =~ /^(apple-|armasm)/; $line =~ s/\.arch/$comm$&/x if $as_type =~ /^(apple-|clang|armasm)/; $line =~ s/\.object_arch/$comm$&/x if $as_type =~ /^(apple-|armasm)/; $line =~ s/.section\s+.note.GNU-stack.*/$comm$&/x if $as_type =~ /^(apple-|armasm)/; $line =~ s/\.syntax/$comm$&/x if $as_type =~ /armasm/; $line =~ s/\.hword/.short/x; $line =~ s/\.xword/.quad/x; $line =~ s/\.dword/.quad/x; if ($as_type =~ /^apple-/) { # the syntax for these is a little different $line =~ s/\.global/.globl/x; # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const $line =~ s/(.*)\.rodata/.const_data/x; $line =~ s/\.int/.long/x; $line =~ s/\.float/.single/x; } if ($as_type eq "apple-gas") { $line =~ s/vmrs\s+APSR_nzcv/fmrx r15/x; } if ($as_type eq "armasm") { $line =~ s/\.global/EXPORT/x; $line =~ s/\.extern/IMPORT/x; $line =~ s/\.int/dcd/x; $line =~ s/\.long/dcd/x; $line =~ s/\.float/dcfs/x; $line =~ s/\.word/dcd/x; $line =~ s/\.short/dcw/x; $line =~ s/\.byte/dcb/x; $line =~ s/\.quad/dcq/x; $line =~ s/\.ascii/dcb/x; $line =~ s/\.asciz(.*)$/dcb\1,0/x; $line =~ s/\.thumb/THUMB/x; $line =~ s/\.arm/ARM/x; # The alignment in AREA is the power of two, just as .align in gas $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=4, CODEALIGN/; $line =~ s/(\s*)(.*)\.ro?data(\s*,\s*"\w+")?/$1AREA |.rdata|, DATA, READONLY, ALIGN=5/; $line =~ s/\.data/AREA |.data|, DATA, ALIGN=5/; } if ($as_type eq "armasm" and $arch eq "arm") { $line =~ s/fmxr/vmsr/; $line =~ s/fmrx/vmrs/; $line =~ s/fadds/vadd.f32/; # Armasm in VS 2019 16.3 errors out on "it" instructions. But # armasm implicitly adds the necessary it instructions anyway, so we # can just filter them out. 
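# The substitution below prefixes the matched it/itt/itte/... line with the armasm comment character, so e.g. "    ittt eq" becomes ";    ittt eq" while the conditional instructions that follow remain untouched.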
$line =~ s/^\s*it[te]*\s+/$comm$&/; } if ($as_type eq "armasm" and $arch eq "aarch64") { # Convert "b.eq" into "beq" $line =~ s/\bb\.($arm_cond_codes)\b/b\1/; } # catch unknown section names that aren't mach-o style (with a comma) if ($as_type =~ /apple-/ and $line =~ /.section ([^,]*)$/) { die ".section $1 unsupported; figure out the mach-o section name and add it"; } print ASMFILE $line; } if ($as_type ne "armasm") { print ASMFILE ".text\n"; print ASMFILE ".align 2\n"; foreach my $literal (keys %literal_labels) { print ASMFILE "$literal_labels{$literal}:\n $literal_expr $literal\n"; } map print(ASMFILE ".thumb_func $_\n"), grep exists $thumb_labels{$_}, keys %call_targets; } else { map print(ASMFILE "\tIMPORT $_\n"), grep ! exists $labels_seen{$_}, (keys %call_targets, keys %import_symbols); print ASMFILE "\tEND\n"; } close(INPUT) or exit 1; close(ASMFILE) or exit 1; if ($as_type eq "armasm" and ! defined $ENV{GASPP_DEBUG}) { print STDERR join(" ", @gcc_cmd)."\n" if $verbose; system(@gcc_cmd) == 0 or die "Error running assembler"; } END { unlink($tempfile) if defined $tempfile; } #exit 1 x264-master/tools/msvsdepend.sh000077500000000000000000000037401502133446700167320ustar00rootroot00000000000000#!/bin/sh # Output a Makefile rule describing the dependencies of a given source file. # Expected arguments are $(CC) $(CFLAGS) $(SRC) $(OBJ) set -f [ -n "$1" ] && [ -n "$3" ] && [ -n "$4" ] || exit 1 # Add flags to only perform syntax checking and output a list of included files # For sources that aren't C, run preprocessing to NUL instead. case "$3" in *.c) opts="-W0 -Zs" ;; *) opts="-P -FiNUL" ;; esac # Discard all output other than included files # Convert '\' directory separators to '/' # Remove system includes (hack: check for "/Program Files" string in path) # Add the source file itself as a dependency deps="$($1 $2 -nologo -showIncludes $opts "$3" 2>&1 | grep '^Note: including file:' | sed 's/^Note: including file:[[:space:]]*\(.*\)$/\1/; s/\\/\//g' | sed '/\/[Pp]rogram [Ff]iles/d') $3" # Convert Windows paths to Unix paths if possible if command -v cygpath >/dev/null 2>&1 ; then IFS=' ' deps="$(cygpath -u -- $deps)" elif grep -q 'Microsoft' /proc/sys/kernel/osrelease 2>/dev/null ; then # Running under WSL. We don't have access to cygpath but since the Windows # file system resides under "/mnt//" we can simply replace # "C:" with "/mnt/c". This command uses a GNU extension to sed but that's # available on WSL so we don't need to limit ourselves by what POSIX says. deps="$(printf '%s' "$deps" | sed 's/^\([a-zA-Z]\):/\/mnt\/\L\1/')" fi # Escape characters as required to create valid Makefile file names escape() { sed 's/ /\\ /g; s/#/\\#/g; s/\$/\$\$/g' } # Remove prefixes that are equal to the working directory # Sort and remove duplicate entries # Escape and collapse the dependencies into one line deps="$(printf '%s' "$deps" | sed "s/^$(pwd | sed 's/\//\\\//g')\///; s/^\.\///" | sort | uniq | escape | tr -s '\n\r' ' ' | sed 's/^ *\(.*\) $/\1/')" # Escape the target file name as well target="$(printf '%s' "$4" | escape)" printf '%s: %s\n' "$target" "$deps" x264-master/tools/q_matrix_jvt.cfg000066400000000000000000000027101502133446700174070ustar00rootroot00000000000000# This an example configuration file for initializing the quantization matrix. # Altogether 6 matrices for 4x4 blocks and 2 matrix for 8x8 blocks. # The values range from 1 to 255. # If first value of matrix is equal to 0, default values ("JVT") will be used # for that matrix. 
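# A matrix file in this format can be loaded with x264's --cqmfile option.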
# If a matrix is completely omitted, it will be filled with 16s. # # Note: JM expects CHROMAU and CHROMAV to be specified separately, whereas # x264 forces them to use the same matrix. If U and V are specified to have # different matrices, only the first is used. #################################################################################### INTRA4X4_LUMA = 6,13,20,28, 13,20,28,32, 20,28,32,37, 28,32,37,42 INTRA4X4_CHROMAU = 6,13,20,28, 13,20,28,32, 20,28,32,37, 28,32,37,42 INTRA4X4_CHROMAV = 6,13,20,28, 13,20,28,32, 20,28,32,37, 28,32,37,42 INTER4X4_LUMA = 10,14,20,24, 14,20,24,27, 20,24,27,30, 24,27,30,34 INTER4X4_CHROMAU = 10,14,20,24, 14,20,24,27, 20,24,27,30, 24,27,30,34 INTER4X4_CHROMAV = 10,14,20,24, 14,20,24,27, 20,24,27,30, 24,27,30,34 INTRA8X8_LUMA = 6,10,13,16,18,23,25,27, 10,11,16,18,23,25,27,29, 13,16,18,23,25,27,29,31, 16,18,23,25,27,29,31,33, 18,23,25,27,29,31,33,36, 23,25,27,29,31,33,36,38, 25,27,29,31,33,36,38,40, 27,29,31,33,36,38,40,42 INTER8X8_LUMA = 9,13,15,17,19,21,22,24, 13,13,17,19,21,22,24,25, 15,17,19,21,22,24,25,27, 17,19,21,22,24,25,27,28, 19,21,22,24,25,27,28,30, 21,22,24,25,27,28,30,32, 22,24,25,27,28,30,32,33, 24,25,27,28,30,32,33,35 x264-master/tools/test_x264.py000077500000000000000000000373151502133446700163470ustar00rootroot00000000000000#!/usr/bin/env python import operator from optparse import OptionGroup import sys from time import time from digress.cli import Dispatcher as _Dispatcher from digress.errors import ComparisonError, FailedTestError, DisabledTestError from digress.testing import depends, comparer, Fixture, Case from digress.comparers import compare_pass from digress.scm import git as x264git from subprocess import Popen, PIPE, STDOUT import os import re import shlex import inspect from random import randrange, seed from math import ceil from itertools import imap, izip os.chdir(os.path.join(os.path.dirname(__file__), "..")) # options OPTIONS = [ [ "--tune %s" % t for t in ("film", "zerolatency") ], ("", "--intra-refresh"), ("", "--no-cabac"), ("", "--interlaced"), ("", "--slice-max-size 1000"), ("", "--frame-packing 5"), [ "--preset %s" % p for p in ("ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow", "slower", "veryslow", "placebo") ] ] # end options def compare_yuv_output(width, height): def _compare_yuv_output(file_a, file_b): size_a = os.path.getsize(file_a) size_b = os.path.getsize(file_b) if size_a != size_b: raise ComparisonError("%s is not the same size as %s" % ( file_a, file_b )) BUFFER_SIZE = 8196 offset = 0 with open(file_a) as f_a: with open(file_b) as f_b: for chunk_a, chunk_b in izip( imap( lambda i: f_a.read(BUFFER_SIZE), xrange(size_a // BUFFER_SIZE + 1) ), imap( lambda i: f_b.read(BUFFER_SIZE), xrange(size_b // BUFFER_SIZE + 1) ) ): chunk_size = len(chunk_a) if chunk_a != chunk_b: for i in xrange(chunk_size): if chunk_a[i] != chunk_b[i]: # calculate the macroblock, plane and frame from the offset offs = offset + i y_plane_area = width * height u_plane_area = y_plane_area + y_plane_area * 0.25 v_plane_area = u_plane_area + y_plane_area * 0.25 pixel = offs % v_plane_area frame = offs // v_plane_area if pixel < y_plane_area: plane = "Y" pixel_x = pixel % width pixel_y = pixel // width macroblock = (ceil(pixel_x / 16.0), ceil(pixel_y / 16.0)) elif pixel < u_plane_area: plane = "U" pixel -= y_plane_area pixel_x = pixel % width pixel_y = pixel // width macroblock = (ceil(pixel_x / 8.0), ceil(pixel_y / 8.0)) else: plane = "V" pixel -= u_plane_area pixel_x = pixel % width pixel_y = pixel // width 
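# the chroma planes of 4:2:0 video are subsampled 2x2, so a 16x16 luma macroblock corresponds to 8x8 chroma samples, hence ceil(.../8.0) below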
macroblock = (ceil(pixel_x / 8.0), ceil(pixel_y / 8.0)) macroblock = tuple([ int(x) for x in macroblock ]) raise ComparisonError("%s differs from %s at frame %d, " \ "macroblock %s on the %s plane (offset %d)" % ( file_a, file_b, frame, macroblock, plane, offs) ) offset += chunk_size return _compare_yuv_output def program_exists(program): def is_exe(fpath): return os.path.exists(fpath) and os.access(fpath, os.X_OK) fpath, fname = os.path.split(program) if fpath: if is_exe(program): return program else: for path in os.environ["PATH"].split(os.pathsep): exe_file = os.path.join(path, program) if is_exe(exe_file): return exe_file return None class x264(Fixture): scm = x264git class Compile(Case): @comparer(compare_pass) def test_configure(self): Popen([ "make", "distclean" ], stdout=PIPE, stderr=STDOUT).communicate() configure_proc = Popen([ "./configure" ] + self.fixture.dispatcher.configure, stdout=PIPE, stderr=STDOUT) output = configure_proc.communicate()[0] if configure_proc.returncode != 0: raise FailedTestError("configure failed: %s" % output.replace("\n", " ")) @depends("configure") @comparer(compare_pass) def test_make(self): make_proc = Popen([ "make", "-j5" ], stdout=PIPE, stderr=STDOUT) output = make_proc.communicate()[0] if make_proc.returncode != 0: raise FailedTestError("make failed: %s" % output.replace("\n", " ")) _dimension_pattern = re.compile(r"\w+ [[]info[]]: (\d+)x(\d+)[pi] \d+:\d+ @ \d+/\d+ fps [(][vc]fr[)]") def _YUVOutputComparisonFactory(): class YUVOutputComparison(Case): _dimension_pattern = _dimension_pattern depends = [ Compile ] options = [] def __init__(self): for name, meth in inspect.getmembers(self): if name[:5] == "test_" and name[5:] not in self.fixture.dispatcher.yuv_tests: delattr(self.__class__, name) def _run_x264(self): x264_proc = Popen([ "./x264", "-o", "%s.264" % self.fixture.dispatcher.video, "--dump-yuv", "x264-output.yuv" ] + self.options + [ self.fixture.dispatcher.video ], stdout=PIPE, stderr=STDOUT) output = x264_proc.communicate()[0] if x264_proc.returncode != 0: raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " ")) matches = _dimension_pattern.match(output) return (int(matches.group(1)), int(matches.group(2))) @comparer(compare_pass) def test_jm(self): if not program_exists("ldecod"): raise DisabledTestError("jm unavailable") try: runres = self._run_x264() jm_proc = Popen([ "ldecod", "-i", "%s.264" % self.fixture.dispatcher.video, "-o", "jm-output.yuv" ], stdout=PIPE, stderr=STDOUT) output = jm_proc.communicate()[0] if jm_proc.returncode != 0: raise FailedTestError("jm did not complete properly: %s" % output.replace("\n", " ")) try: compare_yuv_output(*runres)("x264-output.yuv", "jm-output.yuv") except ComparisonError, e: raise FailedTestError(e) finally: try: os.remove("x264-output.yuv") except: pass try: os.remove("%s.264" % self.fixture.dispatcher.video) except: pass try: os.remove("jm-output.yuv") except: pass try: os.remove("log.dec") except: pass try: os.remove("dataDec.txt") except: pass @comparer(compare_pass) def test_ffmpeg(self): if not program_exists("ffmpeg"): raise DisabledTestError("ffmpeg unavailable") try: runres = self._run_x264() ffmpeg_proc = Popen([ "ffmpeg", "-vsync 0", "-i", "%s.264" % self.fixture.dispatcher.video, "ffmpeg-output.yuv" ], stdout=PIPE, stderr=STDOUT) output = ffmpeg_proc.communicate()[0] if ffmpeg_proc.returncode != 0: raise FailedTestError("ffmpeg did not complete properly: %s" % output.replace("\n", " ")) try: compare_yuv_output(*runres)("x264-output.yuv", 
"ffmpeg-output.yuv") except ComparisonError, e: raise FailedTestError(e) finally: try: os.remove("x264-output.yuv") except: pass try: os.remove("%s.264" % self.fixture.dispatcher.video) except: pass try: os.remove("ffmpeg-output.yuv") except: pass return YUVOutputComparison class Regression(Case): depends = [ Compile ] _psnr_pattern = re.compile(r"x264 [[]info[]]: PSNR Mean Y:\d+[.]\d+ U:\d+[.]\d+ V:\d+[.]\d+ Avg:\d+[.]\d+ Global:(\d+[.]\d+) kb/s:\d+[.]\d+") _ssim_pattern = re.compile(r"x264 [[]info[]]: SSIM Mean Y:(\d+[.]\d+) [(]\d+[.]\d+db[)]") def __init__(self): if self.fixture.dispatcher.x264: self.__class__.__name__ += " %s" % " ".join(self.fixture.dispatcher.x264) def test_psnr(self): try: x264_proc = Popen([ "./x264", "-o", "%s.264" % self.fixture.dispatcher.video, "--psnr" ] + self.fixture.dispatcher.x264 + [ self.fixture.dispatcher.video ], stdout=PIPE, stderr=STDOUT) output = x264_proc.communicate()[0] if x264_proc.returncode != 0: raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " ")) for line in output.split("\n"): if line.startswith("x264 [info]: PSNR Mean"): return float(self._psnr_pattern.match(line).group(1)) raise FailedTestError("no PSNR output caught from x264") finally: try: os.remove("%s.264" % self.fixture.dispatcher.video) except: pass def test_ssim(self): try: x264_proc = Popen([ "./x264", "-o", "%s.264" % self.fixture.dispatcher.video, "--ssim" ] + self.fixture.dispatcher.x264 + [ self.fixture.dispatcher.video ], stdout=PIPE, stderr=STDOUT) output = x264_proc.communicate()[0] if x264_proc.returncode != 0: raise FailedTestError("x264 did not complete properly: %s" % output.replace("\n", " ")) for line in output.split("\n"): if line.startswith("x264 [info]: SSIM Mean"): return float(self._ssim_pattern.match(line).group(1)) raise FailedTestError("no PSNR output caught from x264") finally: try: os.remove("%s.264" % self.fixture.dispatcher.video) except: pass def _generate_random_commandline(): commandline = [] for suboptions in OPTIONS: commandline.append(suboptions[randrange(0, len(suboptions))]) return filter(None, reduce(operator.add, [ shlex.split(opt) for opt in commandline ])) _generated = [] fixture = x264() fixture.register_case(Compile) fixture.register_case(Regression) class Dispatcher(_Dispatcher): video = "akiyo_qcif.y4m" products = 50 configure = [] x264 = [] yuv_tests = [ "jm" ] def _populate_parser(self): super(Dispatcher, self)._populate_parser() # don't do a whole lot with this tcase = _YUVOutputComparisonFactory() yuv_tests = [ name[5:] for name, meth in filter(lambda pair: pair[0][:5] == "test_", inspect.getmembers(tcase)) ] group = OptionGroup(self.optparse, "x264 testing-specific options") group.add_option( "-v", "--video", metavar="FILENAME", action="callback", dest="video", type=str, callback=lambda option, opt, value, parser: setattr(self, "video", value), help="yuv video to perform testing on (default: %s)" % self.video ) group.add_option( "-s", "--seed", metavar="SEED", action="callback", dest="seed", type=int, callback=lambda option, opt, value, parser: setattr(self, "seed", value), help="seed for the random number generator (default: unix timestamp)" ) group.add_option( "-p", "--product-tests", metavar="NUM", action="callback", dest="video", type=int, callback=lambda option, opt, value, parser: setattr(self, "products", value), help="number of cartesian products to generate for yuv comparison testing (default: %d)" % self.products ) group.add_option( "--configure-with", metavar="FLAGS", action="callback", 
dest="configure", type=str, callback=lambda option, opt, value, parser: setattr(self, "configure", shlex.split(value)), help="options to run ./configure with" ) group.add_option( "--yuv-tests", action="callback", dest="yuv_tests", type=str, callback=lambda option, opt, value, parser: setattr(self, "yuv_tests", [ val.strip() for val in value.split(",") ]), help="select tests to run with yuv comparisons (default: %s, available: %s)" % ( ", ".join(self.yuv_tests), ", ".join(yuv_tests) ) ) group.add_option( "--x264-with", metavar="FLAGS", action="callback", dest="x264", type=str, callback=lambda option, opt, value, parser: setattr(self, "x264", shlex.split(value)), help="additional options to run ./x264 with" ) self.optparse.add_option_group(group) def pre_dispatch(self): if not hasattr(self, "seed"): self.seed = int(time()) print "Using seed: %d" % self.seed seed(self.seed) for i in xrange(self.products): YUVOutputComparison = _YUVOutputComparisonFactory() commandline = _generate_random_commandline() counter = 0 while commandline in _generated: counter += 1 commandline = _generate_random_commandline() if counter > 100: print >>sys.stderr, "Maximum command-line regeneration exceeded. " \ "Try a different seed or specify fewer products to generate." sys.exit(1) commandline += self.x264 _generated.append(commandline) YUVOutputComparison.options = commandline YUVOutputComparison.__name__ = ("%s %s" % (YUVOutputComparison.__name__, " ".join(commandline))) fixture.register_case(YUVOutputComparison) Dispatcher(fixture).dispatch() x264-master/version.sh000077500000000000000000000015261502133446700151070ustar00rootroot00000000000000#!/bin/sh cd "$(dirname "$0")" >/dev/null && [ -f x264.h ] || exit 1 api="$(grep '#define X264_BUILD' < x264.h | sed 's/^.* \([1-9][0-9]*\).*$/\1/')" ver="x" version="" if [ -d .git ] && command -v git >/dev/null 2>&1 ; then localver="$(($(git rev-list HEAD | wc -l)))" if [ "$localver" -gt 1 ] ; then ver_diff="$(($(git rev-list origin/master..HEAD | wc -l)))" ver="$((localver-ver_diff))" echo "#define X264_REV $ver" echo "#define X264_REV_DIFF $ver_diff" if [ "$ver_diff" -ne 0 ] ; then ver="$ver+$ver_diff" fi if git status | grep -q "modified:" ; then ver="${ver}M" fi ver="$ver $(git rev-list -n 1 HEAD | cut -c 1-7)" version=" r$ver" fi fi echo "#define X264_VERSION \"$version\"" echo "#define X264_POINTVER \"0.$api.$ver\"" x264-master/x264.c000066400000000000000000002653301502133446700137370ustar00rootroot00000000000000/***************************************************************************** * x264: top-level x264cli functions ***************************************************************************** * Copyright (C) 2003-2025 x264 project * * Authors: Loren Merritt * Laurent Aimar * Steven Walters * Fiona Glaser * Kieran Kunhya * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
* * This program is also available under a commercial proprietary license. * For more information, contact us at licensing@x264.com. *****************************************************************************/ #ifdef _WIN32 /* The following two defines must be located before the inclusion of any system header files. */ #define WINVER 0x0500 #define _WIN32_WINNT 0x0500 #endif #include "x264cli.h" #ifdef _WIN32 #include #include /* _setmode() */ #include /* _O_BINARY */ #endif #include #include #include "input/input.h" #include "output/output.h" #include "filters/filters.h" #define QP_MAX_SPEC (51+6*2) #define QP_MAX (QP_MAX_SPEC+18) #define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ ) #if HAVE_LAVF #undef DECLARE_ALIGNED #include #include #include #endif #if HAVE_SWSCALE #undef DECLARE_ALIGNED #include #endif #if HAVE_FFMS #include #endif #if HAVE_GPAC #include #endif #if HAVE_LSMASH #include #endif #ifdef _WIN32 #define CONSOLE_TITLE_SIZE 200 static wchar_t org_console_title[CONSOLE_TITLE_SIZE] = L""; void x264_cli_set_console_title( const char *title ) { wchar_t title_utf16[CONSOLE_TITLE_SIZE]; if( MultiByteToWideChar( CP_UTF8, MB_ERR_INVALID_CHARS, title, -1, title_utf16, CONSOLE_TITLE_SIZE ) ) SetConsoleTitleW( title_utf16 ); } /* Retrieve command line arguments as UTF-8. */ static int get_argv_utf8( int *argc_ptr, char ***argv_ptr ) { int ret = 0; wchar_t **argv_utf16 = CommandLineToArgvW( GetCommandLineW(), argc_ptr ); if( argv_utf16 ) { int argc = *argc_ptr; int offset = (argc+1) * sizeof(char*); int size = offset; for( int i = 0; i < argc; i++ ) size += WideCharToMultiByte( CP_UTF8, 0, argv_utf16[i], -1, NULL, 0, NULL, NULL ); char **argv = *argv_ptr = malloc( size ); if( argv ) { for( int i = 0; i < argc; i++ ) { argv[i] = (char*)argv + offset; offset += WideCharToMultiByte( CP_UTF8, 0, argv_utf16[i], -1, argv[i], size-offset, NULL, NULL ); } argv[argc] = NULL; ret = 1; } LocalFree( argv_utf16 ); } return ret; } #endif /* Ctrl-C handler */ static volatile int b_ctrl_c = 0; static void sigint_handler( int a ) { b_ctrl_c = 1; } typedef struct { int b_progress; int i_seek; hnd_t hin; hnd_t hout; FILE *qpfile; FILE *tcfile_out; double timebase_convert_multiplier; int i_pulldown; } cli_opt_t; /* file i/o operation structs */ cli_input_t cli_input; static cli_output_t cli_output; /* video filter operation struct */ static cli_vid_filter_t filter; const char * const x264_avcintra_class_names[] = { "50", "100", "200", "300", "480", 0 }; const char * const x264_cqm_names[] = { "flat", "jvt", 0 }; const char * const x264_log_level_names[] = { "none", "error", "warning", "info", "debug", 0 }; const char * const x264_partition_names[] = { "p8x8", "p4x4", "b8x8", "i8x8", "i4x4", "none", "all", 0 }; const char * const x264_pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 }; const char * const x264_range_names[] = { "auto", "tv", "pc", 0 }; const char * const x264_output_csp_names[] = { #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I400 "i400", #endif #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I420 "i420", #endif #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I422 "i422", #endif #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I444 "i444", "rgb", #endif 0 }; const char * const x264_valid_profile_names[] = { #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT <= X264_CSP_I420 #if HAVE_BITDEPTH8 #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I420 "baseline", "main", #endif "high", #endif #if 
HAVE_BITDEPTH10 "high10", #endif #endif #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I422 "high422", #endif "high444", 0 }; const char * const x264_demuxer_names[] = { "auto", "raw", "y4m", #if HAVE_AVS "avs", #endif #if HAVE_LAVF "lavf", #endif #if HAVE_FFMS "ffms", #endif 0 }; const char * const x264_muxer_names[] = { "auto", "raw", "mkv", "flv", #if HAVE_GPAC || HAVE_LSMASH "mp4", #endif 0 }; static const char * const chroma_format_names[] = { [0] = "all", [X264_CSP_I400] = "i400", [X264_CSP_I420] = "i420", [X264_CSP_I422] = "i422", [X264_CSP_I444] = "i444" }; typedef struct { int mod; uint8_t pattern[24]; float fps_factor; } cli_pulldown_t; enum pulldown_type_e { X264_PULLDOWN_22 = 1, X264_PULLDOWN_32, X264_PULLDOWN_64, X264_PULLDOWN_DOUBLE, X264_PULLDOWN_TRIPLE, X264_PULLDOWN_EURO }; #define TB PIC_STRUCT_TOP_BOTTOM #define BT PIC_STRUCT_BOTTOM_TOP #define TBT PIC_STRUCT_TOP_BOTTOM_TOP #define BTB PIC_STRUCT_BOTTOM_TOP_BOTTOM static const cli_pulldown_t pulldown_values[] = { [X264_PULLDOWN_22] = {1, {TB}, 1.0}, [X264_PULLDOWN_32] = {4, {TBT, BT, BTB, TB}, 1.25}, [X264_PULLDOWN_64] = {2, {PIC_STRUCT_DOUBLE, PIC_STRUCT_TRIPLE}, 1.0}, [X264_PULLDOWN_DOUBLE] = {1, {PIC_STRUCT_DOUBLE}, 2.0}, [X264_PULLDOWN_TRIPLE] = {1, {PIC_STRUCT_TRIPLE}, 3.0}, [X264_PULLDOWN_EURO] = {24, {TBT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BT, BTB, TB, TB, TB, TB, TB, TB, TB, TB, TB, TB, TB}, 25.0/24.0} }; #undef TB #undef BT #undef TBT #undef BTB // indexed by pic_struct enum static const float pulldown_frame_duration[10] = { 0.0, 1, 0.5, 0.5, 1, 1, 1.5, 1.5, 2, 3 }; static void help( x264_param_t *defaults, int longhelp ); static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt ); static int encode( x264_param_t *param, cli_opt_t *opt ); /* logging and printing for within the cli system */ static int cli_log_level = X264_LOG_INFO; void x264_cli_log( const char *name, int i_level, const char *fmt, ... ) { if( i_level > cli_log_level ) return; char *s_level; switch( i_level ) { case X264_LOG_ERROR: s_level = "error"; break; case X264_LOG_WARNING: s_level = "warning"; break; case X264_LOG_INFO: s_level = "info"; break; case X264_LOG_DEBUG: s_level = "debug"; break; default: s_level = "unknown"; break; } fprintf( stderr, "%s [%s]: ", name, s_level ); va_list arg; va_start( arg, fmt ); x264_vfprintf( stderr, fmt, arg ); va_end( arg ); } void x264_cli_printf( int i_level, const char *fmt, ... 
static void help( x264_param_t *defaults, int longhelp );
static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt );
static int encode( x264_param_t *param, cli_opt_t *opt );

/* logging and printing for within the cli system */
static int cli_log_level = X264_LOG_INFO;

void x264_cli_log( const char *name, int i_level, const char *fmt, ... )
{
    if( i_level > cli_log_level )
        return;
    char *s_level;
    switch( i_level )
    {
        case X264_LOG_ERROR:
            s_level = "error";
            break;
        case X264_LOG_WARNING:
            s_level = "warning";
            break;
        case X264_LOG_INFO:
            s_level = "info";
            break;
        case X264_LOG_DEBUG:
            s_level = "debug";
            break;
        default:
            s_level = "unknown";
            break;
    }
    fprintf( stderr, "%s [%s]: ", name, s_level );
    va_list arg;
    va_start( arg, fmt );
    x264_vfprintf( stderr, fmt, arg );
    va_end( arg );
}

void x264_cli_printf( int i_level, const char *fmt, ... )
{
    if( i_level > cli_log_level )
        return;
    va_list arg;
    va_start( arg, fmt );
    x264_vfprintf( stderr, fmt, arg );
    va_end( arg );
}

static void print_version_info( void )
{
#ifdef X264_POINTVER
    printf( "x264 "X264_POINTVER"\n" );
#else
    printf( "x264 0.%d.X\n", X264_BUILD );
#endif
#if HAVE_SWSCALE
    printf( "(libswscale %d.%d.%d)\n", LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO );
#endif
#if HAVE_LAVF
    printf( "(libavformat %d.%d.%d)\n", LIBAVFORMAT_VERSION_MAJOR, LIBAVFORMAT_VERSION_MINOR, LIBAVFORMAT_VERSION_MICRO );
#endif
#if HAVE_FFMS
    printf( "(ffmpegsource %d.%d.%d.%d)\n", FFMS_VERSION >> 24, (FFMS_VERSION & 0xff0000) >> 16,
            (FFMS_VERSION & 0xff00) >> 8, FFMS_VERSION & 0xff );
#endif
#if HAVE_GPAC
    printf( "(gpac " GPAC_VERSION ")\n" );
#endif
#if HAVE_LSMASH
    printf( "(lsmash %d.%d.%d)\n", LSMASH_VERSION_MAJOR, LSMASH_VERSION_MINOR, LSMASH_VERSION_MICRO );
#endif
    printf( "built on " __DATE__ ", " );
#ifdef __INTEL_COMPILER
    printf( "intel: %.2f (%d)\n", __INTEL_COMPILER / 100.f, __INTEL_COMPILER_BUILD_DATE );
#elif defined(__clang__)
    printf( "clang: " __clang_version__ "\n" );
#elif defined(__GNUC__)
    printf( "gcc: " __VERSION__ "\n" );
#elif defined(_MSC_FULL_VER)
    printf( "msvc: %.2f (%u)\n", _MSC_VER / 100.f, _MSC_FULL_VER );
#else
    printf( "using an unknown compiler\n" );
#endif
    printf( "x264 configuration: --chroma-format=%s\n", chroma_format_names[X264_CHROMA_FORMAT] );
    printf( "libx264 configuration: --chroma-format=%s\n", chroma_format_names[x264_chroma_format] );
    printf( "x264 license: " );
#if HAVE_GPL
    printf( "GPL version 2 or later\n" );
#else
    printf( "Non-GPL commercial\n" );
#endif
#if HAVE_SWSCALE
    const char *license = swscale_license();
    printf( "libswscale%s%s license: %s\n", HAVE_LAVF ? "/libavformat" : "", HAVE_FFMS ? "/ffmpegsource" : "", license );
    if( !strcmp( license, "nonfree and unredistributable" ) ||
        (!HAVE_GPL && (!strcmp( license, "GPL version 2 or later" ) || !strcmp( license, "GPL version 3 or later" ))))
        printf( "WARNING: This binary is unredistributable!\n" );
#endif
}

REALIGN_STACK int main( int argc, char **argv )
{
    if( argc == 4 && !strcmp( argv[1], "--autocomplete" ) )
        return x264_cli_autocomplete( argv[2], argv[3] );

    x264_param_t param;
    cli_opt_t opt = {0};
    int ret = 0;

    FAIL_IF_ERROR( x264_threading_init(), "unable to initialize threading\n" );

#ifdef _WIN32
    FAIL_IF_ERROR( !get_argv_utf8( &argc, &argv ), "unable to convert command line to UTF-8\n" );

    GetConsoleTitleW( org_console_title, CONSOLE_TITLE_SIZE );
    _setmode( _fileno( stdin ), _O_BINARY );
    _setmode( _fileno( stdout ), _O_BINARY );
    _setmode( _fileno( stderr ), _O_BINARY );
#endif

    x264_param_default( &param );

    /* Parse command line */
    if( parse( argc, argv, &param, &opt ) < 0 )
        ret = -1;

#ifdef _WIN32
    /* Restore title; it can be changed by input modules */
    SetConsoleTitleW( org_console_title );
#endif

    /* Control-C handler */
    signal( SIGINT, sigint_handler );

    if( !ret )
        ret = encode( &param, &opt );

    /* clean up handles */
    if( filter.free )
        filter.free( opt.hin );
    else if( opt.hin )
        cli_input.close_file( opt.hin );
    if( opt.hout )
        cli_output.close_file( opt.hout, 0, 0 );
    if( opt.tcfile_out )
        fclose( opt.tcfile_out );
    if( opt.qpfile )
        fclose( opt.qpfile );

    x264_param_cleanup( &param );

#ifdef _WIN32
    SetConsoleTitleW( org_console_title );
    free( argv );
#endif

    return ret;
}

static char const *strtable_lookup( const char * const table[], int idx )
{
    int i = 0;
    while( table[i] )
        i++;
    return ( idx >= 0 && idx < i && *table[idx] ) ?
table[idx] : "???"; } static char *stringify_names( char *buf, const char * const names[] ) { int i = 0; char *p = buf; for( p[0] = 0; names[i]; i++ ) if( *names[i] ) { if( p != buf ) p += sprintf( p, ", " ); p += sprintf( p, "%s", names[i] ); } return buf; } #define INDENT " " #define INDENT_LEN 32 // strlen( INDENT ) #define SEPARATOR ", " #define SEPARATOR_LEN 2 // strlen( SEPARATOR ) static void print_csp_name_internal( const char *name, size_t *line_len, int last ) { if( name ) { size_t name_len = strlen( name ); if( *line_len + name_len > (80 - SEPARATOR_LEN) ) { printf( "\n" INDENT ); *line_len = INDENT_LEN; } printf( "%s", name ); *line_len += name_len; if( !last ) { printf( SEPARATOR ); *line_len += SEPARATOR_LEN; } } } static void print_csp_names( int longhelp ) { if( longhelp < 2 ) return; printf( " - valid csps for `raw' demuxer:\n" ); printf( INDENT ); size_t line_len = INDENT_LEN; for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ ) print_csp_name_internal( x264_cli_csps[i].name, &line_len, i == X264_CSP_CLI_MAX-1 ); #if HAVE_LAVF printf( "\n" ); printf( " - valid csps for `lavf' demuxer:\n" ); printf( INDENT ); line_len = INDENT_LEN; for( enum AVPixelFormat i = AV_PIX_FMT_NONE+1; i < AV_PIX_FMT_NB; i++ ) print_csp_name_internal( av_get_pix_fmt_name( i ), &line_len, i == AV_PIX_FMT_NB-1 ); #endif printf( "\n" ); } static void help( x264_param_t *defaults, int longhelp ) { char buf[200]; #define H0 printf #define H1 if( longhelp >= 1 ) printf #define H2 if( longhelp == 2 ) printf H0( "x264 core:%d%s\n" "Syntax: x264 [options] -o outfile infile\n" "\n" "Infile can be raw (in which case resolution is required),\n" " or YUV4MPEG (*.y4m),\n" " or Avisynth if compiled with support (%s).\n" " or libav* formats if compiled with lavf support (%s) or ffms support (%s).\n" "Outfile type is selected by filename:\n" " .264 -> Raw bytestream\n" " .mkv -> Matroska\n" " .flv -> Flash Video\n" " .mp4 -> MP4 if compiled with GPAC or L-SMASH support (%s)\n" "Output bit depth: %s\n" "\n" "Options:\n" "\n" " -h, --help List basic options\n" " --longhelp List more options\n" " --fullhelp List all options\n" "\n", X264_BUILD, X264_VERSION, #if HAVE_AVS "yes", #else "no", #endif #if HAVE_LAVF "yes", #else "no", #endif #if HAVE_FFMS "yes", #else "no", #endif #if HAVE_GPAC "gpac", #elif HAVE_LSMASH "lsmash", #else "no", #endif #if HAVE_BITDEPTH8 && HAVE_BITDEPTH10 "8/10" #elif HAVE_BITDEPTH8 "8" #elif HAVE_BITDEPTH10 "10" #else "none" #endif ); H0( "Example usage:\n" ); H0( "\n" ); H0( " Constant quality mode:\n" ); H0( " x264 --crf 24 -o \n" ); H0( "\n" ); H0( " Two-pass with a bitrate of 1000kbps:\n" ); H0( " x264 --pass 1 --bitrate 1000 -o \n" ); H0( " x264 --pass 2 --bitrate 1000 -o \n" ); H0( "\n" ); H0( " Lossless:\n" ); H0( " x264 --qp 0 -o \n" ); H0( "\n" ); H0( " Maximum PSNR at the cost of speed and visual quality:\n" ); H0( " x264 --preset placebo --tune psnr -o \n" ); H0( "\n" ); H0( " Constant bitrate at 1000kbps with a 2 second-buffer:\n"); H0( " x264 --vbv-bufsize 2000 --bitrate 1000 -o \n" ); H0( "\n" ); H0( "Presets:\n" ); H0( "\n" ); H0( " --profile Force the limits of an H.264 profile\n" " Overrides all settings.\n" ); H2( #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT <= X264_CSP_I420 #if HAVE_BITDEPTH8 #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I420 " - baseline:\n" " --no-8x8dct --bframes 0 --no-cabac\n" " --cqm flat --weightp 0\n" " No interlaced.\n" " No lossless.\n" " - main:\n" " --no-8x8dct --cqm flat\n" " No lossless.\n" #endif " - high:\n" " No 
lossless.\n" #endif #if HAVE_BITDEPTH10 " - high10:\n" " No lossless.\n" " Support for bit depth 8-10.\n" #endif #endif #if !X264_CHROMA_FORMAT || X264_CHROMA_FORMAT == X264_CSP_I422 " - high422:\n" " No lossless.\n" " Support for bit depth 8-10.\n" " Support for 4:2:0/4:2:2 chroma subsampling.\n" #endif " - high444:\n" " Support for bit depth 8-10.\n" " Support for 4:2:0/4:2:2/4:4:4 chroma subsampling.\n" ); else H0( " - %s\n", stringify_names( buf, x264_valid_profile_names ) ); H0( " --preset Use a preset to select encoding settings [medium]\n" " Overridden by user settings.\n" ); H2( " - ultrafast:\n" " --no-8x8dct --aq-mode 0 --b-adapt 0\n" " --bframes 0 --no-cabac --no-deblock\n" " --no-mbtree --me dia --no-mixed-refs\n" " --partitions none --rc-lookahead 0 --ref 1\n" " --scenecut 0 --subme 0 --trellis 0\n" " --no-weightb --weightp 0\n" " - superfast:\n" " --no-mbtree --me dia --no-mixed-refs\n" " --partitions i8x8,i4x4 --rc-lookahead 0\n" " --ref 1 --subme 1 --trellis 0 --weightp 1\n" " - veryfast:\n" " --no-mixed-refs --rc-lookahead 10\n" " --ref 1 --subme 2 --trellis 0 --weightp 1\n" " - faster:\n" " --no-mixed-refs --rc-lookahead 20\n" " --ref 2 --subme 4 --weightp 1\n" " - fast:\n" " --rc-lookahead 30 --ref 2 --subme 6\n" " --weightp 1\n" " - medium:\n" " Default settings apply.\n" " - slow:\n" " --direct auto --rc-lookahead 50 --ref 5\n" " --subme 8 --trellis 2\n" " - slower:\n" " --b-adapt 2 --direct auto --me umh\n" " --partitions all --rc-lookahead 60\n" " --ref 8 --subme 9 --trellis 2\n" " - veryslow:\n" " --b-adapt 2 --bframes 8 --direct auto\n" " --me umh --merange 24 --partitions all\n" " --ref 16 --subme 10 --trellis 2\n" " --rc-lookahead 60\n" " - placebo:\n" " --bframes 16 --b-adapt 2 --direct auto\n" " --slow-firstpass --no-fast-pskip\n" " --me tesa --merange 24 --partitions all\n" " --rc-lookahead 60 --ref 16 --subme 11\n" " --trellis 2\n" ); else H0( " - ultrafast,superfast,veryfast,faster,fast\n" " - medium,slow,slower,veryslow,placebo\n" ); H0( " --tune Tune the settings for a particular type of source\n" " or situation\n" " Overridden by user settings.\n" " Multiple tunings are separated by commas.\n" " Only one psy tuning can be used at a time.\n" ); H2( " - film (psy tuning):\n" " --deblock -1:-1 --psy-rd :0.15\n" " - animation (psy tuning):\n" " --bframes {+2} --deblock 1:1\n" " --psy-rd 0.4: --aq-strength 0.6\n" " --ref {Double if >1 else 1}\n" " - grain (psy tuning):\n" " --aq-strength 0.5 --no-dct-decimate\n" " --deadzone-inter 6 --deadzone-intra 6\n" " --deblock -2:-2 --ipratio 1.1\n" " --pbratio 1.1 --psy-rd :0.25\n" " --qcomp 0.8\n" " - stillimage (psy tuning):\n" " --aq-strength 1.2 --deblock -3:-3\n" " --psy-rd 2.0:0.7\n" " - psnr (psy tuning):\n" " --aq-mode 0 --no-psy\n" " - ssim (psy tuning):\n" " --aq-mode 2 --no-psy\n" " - fastdecode:\n" " --no-cabac --no-deblock --no-weightb\n" " --weightp 0\n" " - zerolatency:\n" " --bframes 0 --force-cfr --no-mbtree\n" " --sync-lookahead 0 --sliced-threads\n" " --rc-lookahead 0\n" ); else H0( " - psy tunings: film,animation,grain,\n" " stillimage,psnr,ssim\n" " - other tunings: fastdecode,zerolatency\n" ); H2( " --slow-firstpass Don't force these faster settings with --pass 1:\n" " --no-8x8dct --me dia --partitions none\n" " --ref 1 --subme {2 if >2 else unchanged}\n" " --trellis 0 --fast-pskip\n" ); else H1( " --slow-firstpass Don't force faster settings with --pass 1\n" ); H0( "\n" ); H0( "Frame-type options:\n" ); H0( "\n" ); H0( " -I, --keyint Maximum GOP size [%d]\n", defaults->i_keyint_max ); H2( " -i, 
--min-keyint Minimum GOP size [auto]\n" ); H2( " --no-scenecut Disable adaptive I-frame decision\n" ); H2( " --scenecut How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold ); H2( " --intra-refresh Use Periodic Intra Refresh instead of IDR frames\n" ); H1( " -b, --bframes Number of B-frames between I and P [%d]\n", defaults->i_bframe ); H1( " --b-adapt Adaptive B-frame decision method [%d]\n" " Higher values may lower threading efficiency.\n" " - 0: Disabled\n" " - 1: Fast\n" " - 2: Optimal (slow with high --bframes)\n", defaults->i_bframe_adaptive ); H2( " --b-bias Influences how often B-frames are used [%d]\n", defaults->i_bframe_bias ); H1( " --b-pyramid Keep some B-frames as references [%s]\n" " - none: Disabled\n" " - strict: Strictly hierarchical pyramid\n" " - normal: Non-strict (not Blu-ray compatible)\n", strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) ); H1( " --open-gop Use recovery points to close GOPs\n" " Only available with b-frames\n" ); H1( " --no-cabac Disable CABAC\n" ); H1( " -r, --ref Number of reference frames [%d]\n", defaults->i_frame_reference ); H1( " --no-deblock Disable loop filter\n" ); H1( " -f, --deblock Loop filter parameters [%d:%d]\n", defaults->i_deblocking_filter_alphac0, defaults->i_deblocking_filter_beta ); H2( " --slices Number of slices per frame; forces rectangular\n" " slices and is overridden by other slicing options\n" ); else H1( " --slices Number of slices per frame\n" ); H2( " --slices-max Absolute maximum slices per frame; overrides\n" " slice-max-size/slice-max-mbs when necessary\n" ); H2( " --slice-max-size Limit the size of each slice in bytes\n"); H2( " --slice-max-mbs Limit the size of each slice in macroblocks (max)\n"); H2( " --slice-min-mbs Limit the size of each slice in macroblocks (min)\n"); H0( " --tff Enable interlaced mode (top field first)\n" ); H0( " --bff Enable interlaced mode (bottom field first)\n" ); H2( " --constrained-intra Enable constrained intra prediction.\n" ); H0( " --pulldown Use soft pulldown to change frame rate\n" " - %s (requires cfr input)\n", stringify_names( buf, x264_pulldown_names ) ); H2( " --fake-interlaced Flag stream as interlaced but encode progressive.\n" " Makes it possible to encode 25p and 30p Blu-Ray\n" " streams. 
Ignored in interlaced mode.\n" ); H2( " --frame-packing For stereoscopic videos define frame arrangement\n" " - 0: checkerboard - pixels are alternatively from L and R\n" " - 1: column alternation - L and R are interlaced by column\n" " - 2: row alternation - L and R are interlaced by row\n" " - 3: side by side - L is on the left, R on the right\n" " - 4: top bottom - L is on top, R on bottom\n" " - 5: frame alternation - one view per frame\n" " - 6: mono - 2D frame without any frame packing\n" " - 7: tile format - L is on top-left, R split across\n" ); H0( "\n" ); H0( "Ratecontrol:\n" ); H0( "\n" ); H1( " -q, --qp Force constant QP (0-%d, 0=lossless)\n", QP_MAX ); H0( " -B, --bitrate Set bitrate (kbit/s)\n" ); H0( " --crf Quality-based VBR (%d-51) [%.1f]\n", 51 - QP_MAX_SPEC, defaults->rc.f_rf_constant ); H1( " --rc-lookahead Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead ); H0( " --vbv-maxrate Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate ); H0( " --vbv-bufsize Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size ); H2( " --vbv-init Initial VBV buffer occupancy [%.1f]\n", defaults->rc.f_vbv_buffer_init ); H2( " --crf-max With CRF+VBV, limit RF to this value\n" " May cause VBV underflows!\n" ); H2( " --qpmin Set min QP [%d]\n", defaults->rc.i_qp_min ); H2( " --qpmax Set max QP [%d]\n", X264_MIN( defaults->rc.i_qp_max, QP_MAX ) ); H2( " --qpstep Set max QP step [%d]\n", defaults->rc.i_qp_step ); H2( " --ratetol Tolerance of ABR ratecontrol and VBV [%.1f]\n", defaults->rc.f_rate_tolerance ); H2( " --ipratio QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor ); H2( " --pbratio QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor ); H2( " --chroma-qp-offset QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset ); H2( " --aq-mode AQ method [%d]\n" " - 0: Disabled\n" " - 1: Variance AQ (complexity mask)\n" " - 2: Auto-variance AQ\n" " - 3: Auto-variance AQ with bias to dark scenes\n", defaults->rc.i_aq_mode ); H1( " --aq-strength Reduces blocking and blurring in flat and\n" " textured areas. [%.1f]\n", defaults->rc.f_aq_strength ); H1( "\n" ); H0( " -p, --pass Enable multipass ratecontrol\n" " - 1: First pass, creates stats file\n" " - 2: Last pass, does not overwrite stats file\n" ); H2( " - 3: Nth pass, overwrites stats file\n" ); H1( " --stats Filename for 2 pass stats [\"%s\"]\n", defaults->rc.psz_stat_out ); H2( " --no-mbtree Disable mb-tree ratecontrol.\n"); H2( " --qcomp QP curve compression [%.2f]\n", defaults->rc.f_qcompress ); H2( " --cplxblur Reduce fluctuations in QP (before curve compression) [%.1f]\n", defaults->rc.f_complexity_blur ); H2( " --qblur Reduce fluctuations in QP (after curve compression) [%.1f]\n", defaults->rc.f_qblur ); H2( " --zones //... Tweak the bitrate of regions of the video\n" ); H2( " Each zone is of the form\n" " ,,