pax_global_header00006660000000000000000000000064140336177070014522gustar00rootroot0000000000000052 comment=c86c6418ea6c827513d206694847033f9ca50151 jpeg-quantsmooth-1.20210408/000077500000000000000000000000001403361770700154305ustar00rootroot00000000000000jpeg-quantsmooth-1.20210408/AUTHORS000066400000000000000000000001541403361770700165000ustar00rootroot00000000000000 ### Ilya Kurdyukov https://github.com/ilyakurdyukov https://www.linkedin.com/in/ilya-kurdyukov-a7304119b/ jpeg-quantsmooth-1.20210408/LICENSE000066400000000000000000000636361403361770700164530ustar00rootroot00000000000000 GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. 
Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". 
A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. 
The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. 
However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. 
For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
jpeg-quantsmooth-1.20210408/Makefile000066400000000000000000000123661403361770700171000ustar00rootroot00000000000000 SRCDEPS := quantsmooth.h idct.h libjpegqs.h SRCNAME ?= quantsmooth.c ifeq ($(SRCNAME),jpegqs-mini.c) APPNAME ?= jpegqs-mini $(APPNAME): Makefile else ifeq ($(SRCNAME),example.c) APPNAME ?= example else APPNAME ?= jpegqs endif $(APPNAME): Makefile $(SRCDEPS) endif SIMD := native # machine flags MFLAGS := SIMDFLG := SIMDOBJ := ifeq ($(SIMD),select) SIMDOBJ := jpegqs_base.o jpegqs_sse2.o jpegqs_avx2.o jpegqs_avx512.o else ifeq ($(SIMD),none) SIMDFLG := -DNO_SIMD else ifeq ($(SIMD),native) SIMDFLG := -march=native else ifeq ($(SIMD),avx512) SIMDFLG := -mavx512f -mfma else ifeq ($(SIMD),avx2) SIMDFLG := -mavx2 -mfma else ifeq ($(SIMD),sse2) SIMDFLG := -msse2 endif # multithreading options MTOPTS := -fopenmp # path to save "libgomp.a" LIBMINIOMP := CFLAGS := -Wall -O2 LDFLAGS := -Wl,--gc-sections -s CFLAGS_LIB := $(CFLAGS) $(MFLAGS) $(SIMDFLG) CFLAGS_APP := $(CFLAGS_LIB) -Wextra -pedantic $(MTOPTS) ifeq ($(SIMD),select) CFLAGS_APP += -DSIMD_SELECT endif .PHONY: clean all app lib app: $(APPNAME) all: app lib lib: lib$(APPNAME).a WGET_CMD = @echo "run make with WGET_CMD=wget to allow file downloads" ; echo "DISABLED:" wget jpegsrc.v%.tar.gz: $(WGET_CMD) -O $@ "https://www.ijg.org/files/$@" test -f $@ jpeg-%/jutils.c: jpegsrc.v%.tar.gz tar -xzf jpegsrc.v$(patsubst jpeg-%/jutils.c,%,$@).tar.gz touch $@ jpeg-%/Makefile: jpeg-%/jutils.c cd $(patsubst %/Makefile,%,$@) && ./configure jpeg-%/libjpeg.a: jpeg-%/Makefile cd $(patsubst %/libjpeg.a,%,$@) && $(MAKE) all && test -d .libs && cp .libs/libjpeg.a . || true .PRECIOUS: jpegsrc.v%.tar.gz jpeg-%/jutils.c jpeg-%/Makefile libjpeg-turbo-%.tar.gz: $(WGET_CMD) -O $@ "https://sourceforge.net/projects/libjpeg-turbo/files/$(patsubst libjpeg-turbo-%.tar.gz,%,$@)/libjpeg-turbo-$(patsubst libjpeg-turbo-%.tar.gz,%,$@).tar.gz" test -f $@ libjpeg-turbo-%/jutils.c: libjpeg-turbo-%.tar.gz tar -xzf $(patsubst %/jutils.c,%,$@).tar.gz touch $@ .PRECIOUS: libjpeg-turbo-%.tar.gz libjpeg-turbo-%/jutils.c libjpeg-turbo-1.%/Makefile: libjpeg-turbo-1.%/jutils.c cd $(patsubst %/Makefile,%,$@) && ./configure libjpeg-turbo-1.%/libjpeg.a: libjpeg-turbo-1.%/Makefile cd $(patsubst %/libjpeg.a,%,$@) && $(MAKE) all && cp .libs/lib*jpeg.a . .PRECIOUS: libjpeg-turbo-1.%/Makefile libjpeg-turbo-2.%/.libs/Makefile: libjpeg-turbo-2.%/jutils.c mkdir -p $(patsubst %/Makefile,%,$@) cd $(patsubst %/Makefile,%,$@) && cmake -G"Unix Makefiles" .. libjpeg-turbo-2.%/libjpeg.a: libjpeg-turbo-2.%/.libs/Makefile cd $(patsubst %/Makefile,%,$<) && $(MAKE) all && cp jconfig*.h lib*jpeg.a .. .PRECIOUS: libjpeg-turbo-2.%/.libs/Makefile ifeq ($(JPEGSRC),) JPEGLIB ?= -ljpeg JPEGLIB2 := $(JPEGLIB) CFLAGS_APP += $(filter -I%,$(JPEGLIB)) OBJLIST := else OBJDIR ?= $(JPEGSRC) ALLSRC := $(patsubst $(JPEGSRC)/%.c,%,$(wildcard $(JPEGSRC)/*.c)) SOURCES := jutils jmemmgr jmemnobs jcomapi jerror \ jdapimin jdcoefct jdmarker jdhuff jdinput jdtrans \ jcapimin jcmaster jcmarker jchuff jcparam jctrans \ rdswitch cdjpeg transupp jdatasrc jdatadst ifeq ($(SRCNAME),jpegqs-mini.c) SOURCES += jidctint jfdctint else ifeq ($(SRCNAME),example.c) SOURCES += jidctint jidctfst jidctflt jquant1 jquant2 \ jdapistd jdmaster jdcolor jdpostct jddctmgr jdsample jdmerge jdmainct SOURCES += $(filter jidctred,$(ALLSRC)) endif # version specific sources SOURCES += $(filter jdphuff jcphuff jaricom jdarith jcarith,$(ALLSRC)) OBJLIST := $(patsubst %,$(OBJDIR)/%.o,$(SOURCES)) CFLAGS_APP += -DWITH_JPEGSRC -I$(JPEGSRC) -I. 
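# pattern rule (JPEGSRC mode): compile the selected libjpeg sources listed in SOURCES into objects under $(OBJDIR)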
$(OBJDIR)/%.o: $(JPEGSRC)/%.c $(CC) $(CFLAGS_LIB) -I$(JPEGSRC) -I. -c -o $@ $< JPEGLIB2 := $(OBJLIST) $(APPNAME): $(OBJLIST) endif clean: rm -f $(APPNAME) $(OBJLIST) jpegqs_*.o libjpegqs*.o lib$(APPNAME).a miniomp.o $(LIBMINIOMP) ifneq ($(LIBMINIOMP),) JPEGLIB2 += -L$(dir $(LIBMINIOMP)) $(APPNAME): $(LIBMINIOMP) endif $(APPNAME): $(SRCNAME) $(SIMDOBJ) $(CC) $(CFLAGS_APP) -DAPPNAME=$(APPNAME) -o $@ $< $(JPEGLIB2) $(SIMDOBJ) $(LDFLAGS) -lm ifeq ($(SRCNAME),example.c) SIMDSEL_FLAGS ?= else SIMDSEL_FLAGS ?= -DTRANSCODE_ONLY -DWITH_LOG endif jpegqs_avx512.o: libjpegqs.c $(SRCDEPS) $(CC) $(SIMDSEL_FLAGS) -DSIMD_NAME=avx512 -mavx512f -mfma $(CFLAGS_APP) -DSIMD_AVX512 -c -o $@ $< jpegqs_avx2.o: libjpegqs.c $(SRCDEPS) $(CC) $(SIMDSEL_FLAGS) -DSIMD_NAME=avx2 -mavx2 -mfma $(CFLAGS_APP) -DSIMD_AVX2 -c -o $@ $< jpegqs_sse2.o: libjpegqs.c $(SRCDEPS) $(CC) $(SIMDSEL_FLAGS) -DSIMD_NAME=sse2 -msse2 $(CFLAGS_APP) -DSIMD_SSE2 -c -o $@ $< jpegqs_base.o: libjpegqs.c $(SRCDEPS) $(CC) $(SIMDSEL_FLAGS) -DSIMD_NAME=base $(CFLAGS_APP) -DSIMD_BASE -c -o $@ $< ifeq ($(SIMD),select) lib$(APPNAME).a: libjpegqs_base.o libjpegqs_sse2.o libjpegqs_avx2.o libjpegqs_avx512.o endif lib$(APPNAME).a: libjpegqs.o $(AR) -rsc $@ $^ libjpegqs.o: libjpegqs.c $(SRCDEPS) $(CC) $(CFLAGS_APP) -c -o $@ $< libjpegqs_avx512.o: libjpegqs.c $(SRCDEPS) $(CC) -DSIMD_NAME=avx512 -mavx512f -mfma $(CFLAGS_APP) -DSIMD_AVX512 -c -o $@ $< libjpegqs_avx2.o: libjpegqs.c $(SRCDEPS) $(CC) -DSIMD_NAME=avx2 -mavx2 -mfma $(CFLAGS_APP) -DSIMD_AVX2 -c -o $@ $< libjpegqs_sse2.o: libjpegqs.c $(SRCDEPS) $(CC) -DSIMD_NAME=sse2 -msse2 $(CFLAGS_APP) -DSIMD_SSE2 -c -o $@ $< libjpegqs_base.o: libjpegqs.c $(SRCDEPS) $(CC) -DSIMD_NAME=base $(CFLAGS_APP) -DSIMD_BASE -c -o $@ $< $(LIBMINIOMP): miniomp.o $(AR) -rsc $@ $^ miniomp.o: miniomp.c $(CC) -DOVERFLOW_CHECKS=0 -O2 -Wall -Wextra -c -o $@ $< -ffunction-sections -fdata-sections jpeg-quantsmooth-1.20210408/README.md000066400000000000000000000166451403361770700167230ustar00rootroot00000000000000# JPEG Quant Smooth This program tries to recover the lost precision of DCT coefficients based on a quantization table from a JPEG image. The result is saved as a JPEG image with quantization set to 1 (like a JPEG saved at 100% quality). You may not notice jpeg artifacts on the screen without zooming in, but you may notice them after printing. Also, when editing compressed images, artifacts can accumulate, but if you use this program before editing - the result will be better. * The original project page is [here](https://github.com/ilyakurdyukov/jpeg-quantsmooth). * You can save a smoothed image with the original quantization tables, which will result in the same DCT coefficients as in the original image. (Note: chroma will be slightly different if upsampling is used at quality level 6) * Since this program uses quantization tables to recreate DCT coefficients, applying it to JPEG images that have been re-saved multiple times may not produce good results, as it can recover only from the last JPEG recoding, and not from the previous ones. ## WebAssembly Web version available [here](https://ilyakurdyukov.github.io/jpeg-quantsmooth/). Images are processed locally on your computer. Without multithreading and SIMD optimizations it runs slower than native code. - Click the "Load" button or drag-n-drop JPEG image into the browser window. After processing is complete, you can save the result by clicking the "Save" button. You can edit the *filename* field before saving. 
The *options* field is passed to the *wasm* code when you start processing by loading a file.

## Usage

`jpegqs [options] input.jpg output.jpg`

## Options

`-q, --quality n` Quality setting (0-6, default is 3)
`-n, --niter n` Number of iterations (default is 3)
`-t, --threads n` Set the number of CPU threads to use
`-o, --optimize` Option for libjpeg to produce a smaller output file
`-v, --verbose n` Print libjpeg debug output
`-i, --info n` Print quantsmooth debug output (default is 15)
Use the sum of flags: 0 - silent, 1/2/4 - various information, 8 - processing time, 16 - SIMD type.
`-p, --cpu n` Use to lower the SIMD type if CPU detection fails: 0 - auto, 1 - scalar, 2 - SSE2, 3 - AVX2, 4 - AVX512.
(the `x86` build selects between modes 1-3, the `x86_64` build between 2-4)

- The processing time covers only the smoothing algorithm; JPEG reading and writing time is not included.
- More iterations can make the result look like CG art and make photos look unnatural.

The quality setting selects a combination of processing flags:

3. default
4. adds the `DIAGONALS` flag: smoother diagonal edges, ~1.5 times slower
5. adds the `JOINT_YUV` flag: chroma channels depend on luminance, better color consistency
6. adds the `UPSAMPLE_UV` flag: non-blurring chroma upsampling, unlike `fancy upsampling` from *libjpeg*

- Levels 0-2 are the same as 4-6, but with the `LOW_QUALITY` flag: ~10 times faster, but the quality is lower. `LOW_QUALITY` implies `DIAGONALS` (always set).

## Examples

- Images 3x zoomed.

Original images:

JPEG with quality increasing from 8% to 98%:

After processing:

## Building on Linux

If your system has the *libjpeg* development package installed, just type `make`.
Tested with the `libjpeg-turbo8-dev` package from Ubuntu 18.04.

### Building for a Linux distribution

Use the `SIMD=select` switch, so that `jpegqs` is compiled for different CPU vector extensions.

*amd64* application: `make SIMD=select MFLAGS="-m64" clean app`
*i386* application: `make SIMD=select MFLAGS="-m32 -march=i386" clean app`

Package dependencies: libc, libjpeg, openmp

### Building with libjpeg sources

1. Download and extract the *libjpeg* sources:
   1. *libjpeg*, for example version 6b
      `wget https://www.ijg.org/files/jpegsrc.v6b.tar.gz`
      `tar -xzf jpegsrc.v6b.tar.gz`
   2. *libjpeg-turbo*, for example version 2.0.4
      `wget -O libjpeg-turbo-2.0.4.tar.gz https://sourceforge.net/projects/libjpeg-turbo/files/2.0.4/libjpeg-turbo-2.0.4.tar.gz`
      `tar -xzf libjpeg-turbo-2.0.4.tar.gz`
   - For *libjpeg* (not *turbo*) you can build `jpegqs` in a simpler way: `make JPEGSRC=jpeg-6b`
     This uses the static configuration from `jconfig.h`, which should work for common systems. The following items are not needed if you do so.
2. Configure and build *libjpeg*:
   1. For *libjpeg* and *libjpeg-turbo-1.x.x*: `(cd jpeg-6b && ./configure && make all)`
   2. For *libjpeg-turbo-2.x.x* the `./configure` script is replaced with `cmake`:
      `(cd libjpeg-turbo-2.0.4 && mkdir -p .libs && (cd .libs && cmake -G"Unix Makefiles" .. && make all))`
3. Tell `make` where to find the *libjpeg* includes and `libjpeg.a`:
   `make JPEGLIB="-Ijpeg-6b jpeg-6b/libjpeg.a"`
   For newer versions `libjpeg.a` is located in the `.libs/` dir.

### libjpeg build helper

The `jpegqs` makefile can download, extract and compile `libjpeg` for you. Replace `%VER%` with a version.

- libjpeg: `make jpeg-%VER%/libjpeg.a`
  Tested versions: 6b, 7, 8d, 9c
- libjpeg-turbo: `make libjpeg-turbo-%VER%/libjpeg.a`
  Tested versions: 1.0.0, 1.4.2, 1.5.3, 2.0.4

It will print the link to the archive you need to download, or you can allow downloads by adding `WGET_CMD=wget` to the `make` command line.

## Building on Windows

Get [MSYS2](https://www.msys2.org/), install the needed packages with pacman and build with __release.sh__.
If you are not familiar with building Unix applications on Windows, you can download the program from [releases](https://github.com/ilyakurdyukov/jpeg-quantsmooth/releases).

## Use as a library

Quant Smooth can easily be added to other software that uses `libjpeg` to read JPEG images (a minimal code sketch is shown after the comparison below).

1. Find the source that uses `jpeg_start_decompress` and `jpeg_finish_decompress`.
2. Add an include, either `quantsmooth.h` (compile jpegqs inline) or `libjpegqs.h` (link to the jpegqs library).
3. Change `jpeg_` to `jpegqs_` for these two functions.
4. `jpegqs_start_decompress` takes an additional argument with options, see `example.c` for how to use it.

- Build the `libjpegqs.a` static library with `make SIMD=select lib`

## Alternatives and comparison

Similar projects, and how I see them after some testing.

[**jpeg2png**](https://github.com/victorvde/jpeg2png):
 ✔️ good documentation and math model
 ✔️ has tuning options
 ✔️ better at deblocking low-quality JPEG images
 ❓ has an overblurring problem (the `-w 0.0` switch makes the result a little sharper, but doesn't fix it)
 ➖ 10 to 20 times slower
 ➖ less permissive license (GPL-3.0)

**jpeg2png** can provide roughly the same quality (better in uncommon cases), but is significantly slower.
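For reference, here are the library-usage steps above condensed into a minimal sketch, based on `example.c` from this repository; error handling, the progress callback and the actual pixel handling are omitted, and the row buffer is only illustrative:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "jpeglib.h"
#include "libjpegqs.h" /* or "quantsmooth.h" to compile jpegqs inline */

/* decode a JPEG with DCT smoothing applied; only the options struct and
   the two jpegqs_* calls differ from a plain libjpeg decode loop */
static int decode_smoothed(const char *filename) {
	struct jpeg_decompress_struct ci;
	struct jpeg_error_mgr jerr;
	jpegqs_control_t opts;
	JSAMPROW row = NULL;
	FILE *fp = fopen(filename, "rb");
	if (!fp) return 1;

	memset(&opts, 0, sizeof(opts));
	opts.niter = 3;                   /* same default as the -n option */
	opts.flags |= JPEGQS_DIAGONALS;   /* -q4 */
	opts.flags |= JPEGQS_JOINT_YUV;   /* -q5 */
	opts.flags |= JPEGQS_UPSAMPLE_UV; /* -q6 */

	ci.err = jpeg_std_error(&jerr);
	jpeg_create_decompress(&ci);
	jpeg_stdio_src(&ci, fp);
	jpeg_read_header(&ci, TRUE);
	ci.out_color_space = JCS_RGB;

	jpegqs_start_decompress(&ci, &opts); /* was: jpeg_start_decompress(&ci); */

	row = (JSAMPROW)malloc((size_t)ci.output_width * ci.output_components);
	while (row && ci.output_scanline < ci.output_height) {
		jpeg_read_scanlines(&ci, &row, 1);
		/* use the RGB pixels in row[] here */
	}
	free(row);

	jpegqs_finish_decompress(&ci); /* was: jpeg_finish_decompress(&ci); */
	jpeg_destroy_decompress(&ci);
	fclose(fp);
	return 0;
}
```

Compared to a plain libjpeg decode loop, only the extra include, the `jpegqs_control_t` options and the two `jpegqs_*` calls change.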
[**knusperli**](https://github.com/google/knusperli):  ✔️ more permissive license (Apache-2.0)  ➖ you can hardly see any improvements on the image  ➖ no performance optimizations (but roughly same speed as for quantsmooth with optimizations)  ➖ no any command line options  ➖ uncommon build system **knusperli** is good for nothing, in my opinion. jpeg-quantsmooth-1.20210408/example.c000066400000000000000000000116461403361770700172370ustar00rootroot00000000000000/* * Copyright (C) 2016-2020 Ilya Kurdyukov * * This file is part of jpeg quantsmooth (example) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * This is an example of how to process a JPEG image using Quant Smooth * and get the RGB pixel data without saving the coefficients back to JPEG. * For those who want to make a library or plugin. */ #include #include #include #include #include #ifdef WITH_JPEGSRC #define JPEG_INTERNALS #endif #include "jpeglib.h" // use "libjpegqs.h" for linking with library #include "quantsmooth.h" typedef struct { int width, height, bpp, stride; uint8_t *data; } bitmap_t; static bitmap_t *bitmap_create(int width, int height, int bpp) { bitmap_t *bm; // BMP needs 4-byte row alignment int stride = (width * bpp + 3) & -4; uint64_t size = (int64_t)stride * height + sizeof(bitmap_t); // check for overflow if ((unsigned)((width - 1) | (height - 1)) >= 0x10000 || (uint64_t)(size_t)size != size) return NULL; bm = (bitmap_t*)malloc(size); if (!bm) return bm; bm->width = width; bm->height = height; bm->bpp = bpp; bm->stride = stride; bm->data = (uint8_t*)(bm + 1); return bm; } static void bitmap_free(bitmap_t *in) { if (in) free(in); } typedef struct { struct jpeg_error_mgr pub; jmp_buf setjmp_buffer; } bitmap_jpeg_err_ctx; static void bitmap_jpeg_err(j_common_ptr cinfo) { char errorMsg[JMSG_LENGTH_MAX]; bitmap_jpeg_err_ctx* jerr = (bitmap_jpeg_err_ctx*)cinfo->err; (*(cinfo->err->format_message))(cinfo, errorMsg); fprintf(stderr, "%s\n", errorMsg); longjmp(jerr->setjmp_buffer, 1); } bitmap_t* bitmap_read_jpeg(const char *filename, jpegqs_control_t *opts) { struct jpeg_decompress_struct ci; FILE * volatile fp; int volatile ok = 0; bitmap_t * volatile bm = NULL; void * volatile mem = NULL; bitmap_jpeg_err_ctx jerr; ci.err = jpeg_std_error(&jerr.pub); jerr.pub.error_exit = bitmap_jpeg_err; if (!setjmp(jerr.setjmp_buffer)) do { unsigned x, y, w, h, st; uint8_t *data; bitmap_t *bm1; JSAMPROW *scanline; jpeg_create_decompress(&ci); if (!(fp = fopen(filename, "rb"))) break; jpeg_stdio_src(&ci, fp); jpeg_read_header(&ci, TRUE); ci.out_color_space = JCS_RGB; jpegqs_start_decompress(&ci, opts); w = ci.output_width; h = ci.output_height; bm = bm1 = bitmap_create(w, h, 3); if (!bm1) break; mem = scanline = (JSAMPROW*)malloc(h * sizeof(JSAMPROW)); if (!scanline) break; st = bm1->stride; data = bm1->data; // BMP uses reverse row order for (y = 0; y < h; y++) scanline[y] = (JSAMPLE*)(data + (h - 1 - y) * st); while ((y = ci.output_scanline) < h) 
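			/* jpeg_read_scanlines() may decode fewer rows than requested, so loop until output_scanline reaches the image height */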
jpeg_read_scanlines(&ci, scanline + y, h - y); // need to convert RGB to BGR for BMP for (y = 0; y < h; y++) { JSAMPLE *p = data + y * st, t; for (x = 0; x < w * 3; x += 3) { t = p[x]; p[x] = p[x + 2]; p[x + 2] = t; } for (; x < st; x++) p[x] = 0; } ok = 1; jpegqs_finish_decompress(&ci); } while (0); if (mem) free(mem); if (!ok) { bitmap_free(bm); bm = NULL; } jpeg_destroy_decompress(&ci); if (fp) fclose(fp); return bm; } typedef struct { int init; } progress_data_t; static int progress(void *data, int cur, int max) { progress_data_t *prog = (progress_data_t*)data; printf("%s%i%%", prog->init ? ", " : "progress: ", 100 * cur / max); fflush(stdout); prog->init = 1; // return nonzero value to stop processing return 0; } int main(int argc, char **argv) { bitmap_t *bm; const char *ifn, *ofn; jpegqs_control_t opts; progress_data_t prog; if (argc != 3) { printf("Usage: example input.jpg output.bmp\n"); return 1; } ifn = argv[1]; ofn = argv[2]; memset(&opts, 0, sizeof(opts)); opts.niter = 3; opts.flags |= JPEGQS_DIAGONALS; /* -q4 */ opts.flags |= JPEGQS_JOINT_YUV; /* -q5 */ opts.flags |= JPEGQS_UPSAMPLE_UV; /* -q6 */ prog.init = 0; opts.userdata = &prog; opts.progress = progress; bm = bitmap_read_jpeg(ifn, &opts); if (prog.init) printf("\n"); if (bm) { FILE *f; int w = bm->width, h = bm->height, st = bm->stride; int32_t n = 54, bmp[] = { 0x4d420000, -1, 0, 54, 40, -1, -1, (24 << 16) + 1, 0, -1, 2835, 2835, 0, 0 }; bmp[1] = n + h * st; bmp[5] = w; bmp[6] = h; bmp[9] = h * st; if ((f = fopen(ofn, "wb"))) { n = fwrite((uint8_t*)bmp + 2, 1, n, f); n = fwrite(bm->data, 1, h * st, f); fclose(f); } bitmap_free(bm); } return bm ? 0 : 1; } jpeg-quantsmooth-1.20210408/idct.h000066400000000000000000000464671403361770700165450ustar00rootroot00000000000000/* * idct/fdct SSE2/AVX2/NEON intrinsic optimizations: * Copyright (C) 2016-2020 Ilya Kurdyukov * * contains modified parts of libjpeg: * Copyright (C) 1991-1998, Thomas G. Lane * * This file is part of jpeg quantsmooth * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . 
*/ static const char jpegqs_natural_order[DCTSIZE2] = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; static JSAMPLE range_limit_static[CENTERJSAMPLE * 8]; #ifndef USE_JSIMD #define CONST_BITS 13 #define PASS1_BITS 2 #define FIX_0_298631336 2446 /* FIX(0.298631336) */ #define FIX_0_390180644 3196 /* FIX(0.390180644) */ #define FIX_0_541196100 4433 /* FIX(0.541196100) */ #define FIX_0_765366865 6270 /* FIX(0.765366865) */ #define FIX_0_899976223 7373 /* FIX(0.899976223) */ #define FIX_1_175875602 9633 /* FIX(1.175875602) */ #define FIX_1_501321110 12299 /* FIX(1.501321110) */ #define FIX_1_847759065 15137 /* FIX(1.847759065) */ #define FIX_1_961570560 16069 /* FIX(1.961570560) */ #define FIX_2_053119869 16819 /* FIX(2.053119869) */ #define FIX_2_562915447 20995 /* FIX(2.562915447) */ #define FIX_3_072711026 25172 /* FIX(3.072711026) */ #define DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) #define RANGE_MASK (MAXJSAMPLE * 4 + 3) /* 2 bits wider than legal samples */ static void idct_islow(JCOEFPTR coef_block, JSAMPROW outptr, JDIMENSION stride) { #define M3 \ z2 = M1(2); z3 = M1(6); \ z1 = MUL(ADD(z2, z3), SET1(FIX_0_541196100)); \ tmp2 = SUB(z1, MUL(z3, SET1(FIX_1_847759065))); \ tmp3 = ADD(z1, MUL(z2, SET1(FIX_0_765366865))); \ z2 = M1(0); z3 = M1(4); \ tmp0 = SHL(ADD(z2, z3), CONST_BITS); \ tmp1 = SHL(SUB(z2, z3), CONST_BITS); \ tmp10 = ADD(tmp0, tmp3); tmp13 = SUB(tmp0, tmp3); \ tmp11 = ADD(tmp1, tmp2); tmp12 = SUB(tmp1, tmp2); \ tmp0 = M1(7); tmp1 = M1(5); tmp2 = M1(3); tmp3 = M1(1); \ z1 = ADD(tmp0, tmp3); z2 = ADD(tmp1, tmp2); \ z3 = ADD(tmp0, tmp2); z4 = ADD(tmp1, tmp3); \ z5 = MUL(ADD(z3, z4), SET1(FIX_1_175875602)); \ tmp0 = MUL(tmp0, SET1(FIX_0_298631336)); \ tmp1 = MUL(tmp1, SET1(FIX_2_053119869)); \ tmp2 = MUL(tmp2, SET1(FIX_3_072711026)); \ tmp3 = MUL(tmp3, SET1(FIX_1_501321110)); \ z1 = MUL(z1, SET1(FIX_0_899976223)); \ z2 = MUL(z2, SET1(FIX_2_562915447)); \ z3 = MUL(z3, SET1(FIX_1_961570560)); \ z4 = MUL(z4, SET1(FIX_0_390180644)); \ z3 = SUB(z5, z3); z4 = SUB(z5, z4); \ tmp0 = ADD(tmp0, SUB(z3, z1)); \ tmp1 = ADD(tmp1, SUB(z4, z2)); \ tmp2 = ADD(tmp2, SUB(z3, z2)); \ tmp3 = ADD(tmp3, SUB(z4, z1)); \ M2(0, ADD(tmp10, tmp3)) M2(7, SUB(tmp10, tmp3)) \ M2(1, ADD(tmp11, tmp2)) M2(6, SUB(tmp11, tmp2)) \ M2(2, ADD(tmp12, tmp1)) M2(5, SUB(tmp12, tmp1)) \ M2(3, ADD(tmp13, tmp0)) M2(4, SUB(tmp13, tmp0)) #if 1 && defined(USE_NEON) int ctr; int32x4_t *wsptr, workspace[DCTSIZE2] ALIGN(16); int32x4_t tmp0, tmp1, tmp2, tmp3; int32x4_t tmp10, tmp11, tmp12, tmp13; int32x4_t z1, z2, z3, z4, z5; #define ADD vaddq_s32 #define SUB vsubq_s32 #if 0 static const int32_t tab[12] = { FIX_0_298631336, FIX_0_390180644, FIX_0_541196100, FIX_0_765366865, FIX_0_899976223, FIX_1_175875602, FIX_1_501321110, FIX_1_847759065, FIX_1_961570560, FIX_2_053119869, FIX_2_562915447, FIX_3_072711026 }; int32x4_t t1 = vld1q_s32(tab), t2 = vld1q_s32(tab + 4), t3 = vld1q_s32(tab + 8); #define IDCT_FIX_0_298631336 vget_low_s32(t1), 0 #define IDCT_FIX_0_390180644 vget_low_s32(t1), 1 #define IDCT_FIX_0_541196100 vget_high_s32(t1), 0 #define IDCT_FIX_0_765366865 vget_high_s32(t1), 1 #define IDCT_FIX_0_899976223 vget_low_s32(t2), 0 #define IDCT_FIX_1_175875602 vget_low_s32(t2), 1 #define IDCT_FIX_1_501321110 vget_high_s32(t2), 0 #define IDCT_FIX_1_847759065 vget_high_s32(t2), 1 #define IDCT_FIX_1_961570560 vget_low_s32(t3), 0 
#define IDCT_FIX_2_053119869 vget_low_s32(t3), 1 #define IDCT_FIX_2_562915447 vget_high_s32(t3), 0 #define IDCT_FIX_3_072711026 vget_high_s32(t3), 1 #define MUL(a, b) vmulq_lane_s32(a, b) #define SET1(a) IDCT_##a #else #define MUL vmulq_s32 #define SET1 vdupq_n_s32 #endif #define SHL vshlq_n_s32 wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr += 4, wsptr += 4) { #define M1(i) vmovl_s16(vld1_s16((int16_t*)&coef_block[DCTSIZE*i+ctr])) #define M2(i, tmp) wsptr[(i&3)+(i&4)*2] = vrshrq_n_s32(tmp, CONST_BITS-PASS1_BITS); M3 #undef M1 #undef M2 } wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr += 4, wsptr += 8, outptr += 4*stride) { int32x4x4_t q0 = vld4q_s32((int32_t*)&wsptr[0]), q1 = vld4q_s32((int32_t*)&wsptr[4]); #define M1(i, n) int32x4_t x##i = q##n.val[i & 3]; M1(0, 0) M1(1, 0) M1(2, 0) M1(3, 0) M1(4, 1) M1(5, 1) M1(6, 1) M1(7, 1) #undef M1 #define M1(i) x##i #define M2(i, tmp) x##i = tmp; M3 #undef M1 #undef M2 { int8x8_t t0 = vdup_n_s8(-128); #define M1(i, j) int8x8_t v##i = vqrshrn_n_s16(vuzpq_s16( \ vreinterpretq_s16_s32(x##i), vreinterpretq_s16_s32(x##j)).val[1], \ CONST_BITS+PASS1_BITS+3-16); M1(0, 4) M1(1, 5) M1(2, 6) M1(3, 7) #undef M1 int16x4x2_t p0, p1; int8x8x2_t p2, p3; p0 = vtrn_s16(vreinterpret_s16_s8(v0), vreinterpret_s16_s8(v2)); p1 = vtrn_s16(vreinterpret_s16_s8(v1), vreinterpret_s16_s8(v3)); p2 = vtrn_s8(vreinterpret_s8_s16(p0.val[0]), vreinterpret_s8_s16(p1.val[0])); p3 = vtrn_s8(vreinterpret_s8_s16(p0.val[1]), vreinterpret_s8_s16(p1.val[1])); #define M1(i, p2, j) vst1_s8((int8_t*)&outptr[stride*i], veor_s8(p2.val[j], t0)); M1(0, p2, 0); M1(1, p2, 1); M1(2, p3, 0); M1(3, p3, 1); #undef M1 } } #elif 1 && defined(USE_AVX2) __m256i v0, v1, v2, v3, v4, v5, v6, v7, t0, t1, x0, x1, x2, x3, x4, x5, x6, x7; __m256i tmp0, tmp1, tmp2, tmp3; __m256i tmp10, tmp11, tmp12, tmp13; __m256i z1, z2, z3, z4, z5; #define ADD _mm256_add_epi32 #define SUB _mm256_sub_epi32 #define MUL _mm256_mullo_epi32 #define SET1 _mm256_set1_epi32 #define SHL _mm256_slli_epi32 #define M1(i) _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)&coef_block[DCTSIZE*i])) #define M2(i, tmp) x##i = _mm256_srai_epi32(ADD(tmp, t0), CONST_BITS-PASS1_BITS); t0 = SET1(1 << (CONST_BITS-PASS1_BITS-1)); M3 #undef M1 #undef M2 #define M2(v0, v1, v2, v3, k) \ v0 = _mm256_permute2x128_si256(x0, x4, k); \ v1 = _mm256_permute2x128_si256(x1, x5, k); \ v2 = _mm256_permute2x128_si256(x2, x6, k); \ v3 = _mm256_permute2x128_si256(x3, x7, k); M2(v0, v1, v2, v3, 0x20) M2(v4, v5, v6, v7, 0x31) #undef M2 #define M4(v0, v1, v2, v3, x0, x1, x2, x3) \ t0 = _mm256_unpacklo_epi32(v0, v2); \ t1 = _mm256_unpacklo_epi32(v1, v3); \ x0 = _mm256_unpacklo_epi32(t0, t1); \ x1 = _mm256_unpackhi_epi32(t0, t1); \ t0 = _mm256_unpackhi_epi32(v0, v2); \ t1 = _mm256_unpackhi_epi32(v1, v3); \ x2 = _mm256_unpacklo_epi32(t0, t1); \ x3 = _mm256_unpackhi_epi32(t0, t1); M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #define M1(i) x##i #define M2(i, tmp) v##i = _mm256_srai_epi32(ADD(tmp, t0), CONST_BITS+PASS1_BITS+3); t0 = SET1((256+1) << (CONST_BITS+PASS1_BITS+3-1)); M3 #undef M1 #undef M2 M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #undef M4 x0 = _mm256_packs_epi32(x0, x1); x1 = _mm256_packs_epi32(x2, x3); x4 = _mm256_packs_epi32(x4, x5); x5 = _mm256_packs_epi32(x6, x7); x0 = _mm256_packus_epi16(x0, x1); x4 = _mm256_packus_epi16(x4, x5); v0 = _mm256_unpacklo_epi32(x0, x4); v1 = _mm256_unpackhi_epi32(x0, x4); #if 1 && defined(__x86_64__) #define M1(i, v0, j) *(int64_t*)&outptr[stride*i] = 
_mm256_extract_epi64(v0, j); M1(0, v0, 0) M1(1, v0, 1) M1(2, v1, 0) M1(3, v1, 1) M1(4, v0, 2) M1(5, v0, 3) M1(6, v1, 2) M1(7, v1, 3) #undef M1 #else #define M1(i, v0, l) _mm_store##l##_pd((double*)&outptr[i*stride], _mm_castsi128_pd(_mm256_castsi256_si128(v0))); #define M2(i, v0, l) _mm_store##l##_pd((double*)&outptr[i*stride], _mm_castsi128_pd(_mm256_extracti128_si256(v0, 1))); M1(0, v0, l) M1(1, v0, h) M1(2, v1, l) M1(3, v1, h) M2(4, v0, l) M2(5, v0, h) M2(6, v1, l) M2(7, v1, h) #undef M2 #undef M1 #endif #elif 1 && defined(USE_SSE2) int ctr; __m128i *wsptr, workspace[DCTSIZE2] ALIGN(16); __m128i v0, v1, v2, v3, v4, v5, v6, v7, t0, t1, x0, x1, x2, x3, x4, x5, x6, x7; __m128i tmp0, tmp1, tmp2, tmp3; __m128i tmp10, tmp11, tmp12, tmp13; __m128i z1, z2, z3, z4, z5; #define ADD _mm_add_epi32 #define SUB _mm_sub_epi32 #if BITS_IN_JSAMPLE == 8 && !defined(USE_SSE4) #define MUL _mm_madd_epi16 #define SET1(a) _mm_set1_epi32(a & 0xffff) #else #define MUL _mm_mullo_epi32 #define SET1 _mm_set1_epi32 #endif #define SHL _mm_slli_epi32 t0 = _mm_set1_epi32(1 << (CONST_BITS-PASS1_BITS-1)); wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr += 4, wsptr += 4) { #define M1(i) _mm_cvtepi16_epi32(_mm_loadl_epi64((__m128i*)&coef_block[DCTSIZE*i+ctr])) #define M2(i, tmp) wsptr[(i&3)+(i&4)*2] = _mm_srai_epi32(ADD(tmp, t0), CONST_BITS-PASS1_BITS); M3 #undef M1 #undef M2 } wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr += 4, wsptr += 8, outptr += 4*stride) { #define M1(i) v##i = wsptr[i]; M1(0) M1(1) M1(2) M1(3) M1(4) M1(5) M1(6) M1(7) #undef M1 #define M4(v0, v1, v2, v3, x0, x1, x2, x3) \ t0 = _mm_unpacklo_epi32(v0, v2); \ t1 = _mm_unpacklo_epi32(v1, v3); \ x0 = _mm_unpacklo_epi32(t0, t1); \ x1 = _mm_unpackhi_epi32(t0, t1); \ t0 = _mm_unpackhi_epi32(v0, v2); \ t1 = _mm_unpackhi_epi32(v1, v3); \ x2 = _mm_unpacklo_epi32(t0, t1); \ x3 = _mm_unpackhi_epi32(t0, t1); M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #define M1(i) x##i #define M2(i, tmp) v##i = _mm_srai_epi32(ADD(tmp, t0), CONST_BITS+PASS1_BITS+3); t0 = _mm_set1_epi32((256+1) << (CONST_BITS+PASS1_BITS+3-1)); M3 #undef M1 #undef M2 M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #undef M4 x0 = _mm_packs_epi32(x0, x4); x1 = _mm_packs_epi32(x1, x5); x2 = _mm_packs_epi32(x2, x6); x3 = _mm_packs_epi32(x3, x7); v0 = _mm_packus_epi16(x0, x1); v1 = _mm_packus_epi16(x2, x3); #define M1(i, v0, l) _mm_store##l##_pd((double*)&outptr[i*stride], _mm_castsi128_pd(v0)); M1(0, v0, l) M1(1, v0, h) M1(2, v1, l) M1(3, v1, h) #undef M1 } #else #define NEED_RANGELIMIT int32_t tmp0, tmp1, tmp2, tmp3; int32_t tmp10, tmp11, tmp12, tmp13; int32_t z1, z2, z3, z4, z5; JCOEFPTR inptr = coef_block; JSAMPLE *range_limit = range_limit_static; int ctr; int32_t *wsptr, workspace[DCTSIZE2]; /* buffers data between passes */ #define ADD(a, b) ((a) + (b)) #define SUB(a, b) ((a) - (b)) #define MUL(a, b) ((a) * (b)) #define SET1(a) (a) #define SHL(a, b) ((a) << (b)) #define M1(i) inptr[DCTSIZE*i] #define M2(i, tmp) wsptr[DCTSIZE*i] = DESCALE(tmp, CONST_BITS-PASS1_BITS); wsptr = workspace; for (ctr = DCTSIZE; ctr > 0; ctr--, inptr++, wsptr++) { if (!(M1(1) | M1(2) | M1(3) | M1(4) | M1(5) | M1(6) | M1(7))) { /* AC terms all zero */ int dcval = SHL(M1(0), PASS1_BITS); wsptr[DCTSIZE*0] = dcval; wsptr[DCTSIZE*1] = dcval; wsptr[DCTSIZE*2] = dcval; wsptr[DCTSIZE*3] = dcval; wsptr[DCTSIZE*4] = dcval; wsptr[DCTSIZE*5] = dcval; wsptr[DCTSIZE*6] = dcval; wsptr[DCTSIZE*7] = dcval; continue; } M3 } #undef M1 #undef M2 #define M1(i) wsptr[i] #define M2(i, 
tmp) outptr[i] = range_limit[DESCALE(tmp, CONST_BITS+PASS1_BITS+3) & RANGE_MASK]; wsptr = workspace; for (ctr = 0; ctr < DCTSIZE; ctr++, wsptr += DCTSIZE, outptr += stride) { #ifndef NO_ZERO_ROW_TEST if (!(M1(1) | M1(2) | M1(3) | M1(4) | M1(5) | M1(6) | M1(7))) { /* AC terms all zero */ JSAMPLE dcval = range_limit[DESCALE(wsptr[0], PASS1_BITS+3) & RANGE_MASK]; outptr[0] = dcval; outptr[1] = dcval; outptr[2] = dcval; outptr[3] = dcval; outptr[4] = dcval; outptr[5] = dcval; outptr[6] = dcval; outptr[7] = dcval; continue; } #endif M3 } #undef M1 #undef M2 #endif #undef M3 #undef ADD #undef SUB #undef MUL #undef SET1 #undef SHL } #endif // USE_JSIMD static void range_limit_init() { JSAMPLE *t = range_limit_static; #ifdef NEED_RANGELIMIT int i, c = CENTERJSAMPLE, m = c * 2; for (i = 0; i < c; i++) t[i] = i + c; while (i < 2 * m) t[i++] = m - 1; while (i < 3 * m + c) t[i++] = 0; for (i = 0; i < c; i++) t[3 * m + c + i] = i; #else (void)t; #endif } static void idct_float(float *in, float *out) { float t0, t1, t2, t3, t4, t5, t6, t7, z1, z2, z3, z4, z5; float *ws, buf[DCTSIZE2]; int i; #define M3(inc1, inc2) ws = buf; \ for (i = 0; i < DCTSIZE; i++, inc1, inc2) { \ z2 = M1(2); z3 = M1(6); \ z1 = (z2 + z3) * 0.541196100f; \ t2 = z1 - z3 * 1.847759065f; \ t3 = z1 + z2 * 0.765366865f; \ z2 = M1(0); z3 = M1(4); \ t0 = z2 + z3; t1 = z2 - z3; \ t4 = t0 + t3; t7 = t0 - t3; \ t5 = t1 + t2; t6 = t1 - t2; \ t0 = M1(7); t1 = M1(5); t2 = M1(3); t3 = M1(1); \ z1 = t0 + t3; z2 = t1 + t2; \ z3 = t0 + t2; z4 = t1 + t3; \ z5 = (z3 + z4) * 1.175875602f; \ t0 *= 0.298631336f; t1 *= 2.053119869f; \ t2 *= 3.072711026f; t3 *= 1.501321110f; \ z1 *= 0.899976223f; z2 *= 2.562915447f; \ z3 *= 1.961570560f; z4 *= 0.390180644f; \ z3 -= z5; t0 -= z1 + z3; t2 -= z2 + z3; \ z4 -= z5; t1 -= z2 + z4; t3 -= z1 + z4; \ M2(0, t4 + t3) M2(7, t4 - t3) \ M2(1, t5 + t2) M2(6, t5 - t2) \ M2(2, t6 + t1) M2(5, t6 - t1) \ M2(3, t7 + t0) M2(4, t7 - t0) \ } #define M1(i) in[DCTSIZE*i] #define M2(i, t) ws[DCTSIZE*i] = t; M3(in++, ws++) #undef M1 #undef M2 #define M1(i) ws[i] #define M2(i, t) out[i] = (t) * 0.125f; M3(ws += DCTSIZE, out += DCTSIZE) #undef M1 #undef M2 #undef M3 } static void fdct_float(float *in, float *out) { #define M3 \ z1 = M1(0); z2 = M1(7); t0 = ADD(z1, z2); t7 = SUB(z1, z2); \ z1 = M1(1); z2 = M1(6); t1 = ADD(z1, z2); t6 = SUB(z1, z2); \ z1 = M1(2); z2 = M1(5); t2 = ADD(z1, z2); t5 = SUB(z1, z2); \ z1 = M1(3); z2 = M1(4); t3 = ADD(z1, z2); t4 = SUB(z1, z2); \ z1 = ADD(t0, t3); z4 = SUB(t0, t3); \ z2 = ADD(t1, t2); z3 = SUB(t1, t2); \ M2(0, ADD(z1, z2)) M2(4, SUB(z1, z2)) \ z1 = MUL((ADD(z3, z4)), SET1(0.541196100f)); \ M2(2, ADD(z1, MUL(z4, SET1(0.765366865f)))) \ M2(6, SUB(z1, MUL(z3, SET1(1.847759065f)))) \ z1 = ADD(t4, t7); z2 = ADD(t5, t6); \ z3 = ADD(t4, t6); z4 = ADD(t5, t7); \ z5 = MUL(ADD(z3, z4), SET1(1.175875602f)); \ t4 = MUL(t4, SET1(0.298631336f)); t5 = MUL(t5, SET1(2.053119869f)); \ t6 = MUL(t6, SET1(3.072711026f)); t7 = MUL(t7, SET1(1.501321110f)); \ z1 = MUL(z1, SET1(0.899976223f)); z2 = MUL(z2, SET1(2.562915447f)); \ z3 = SUB(MUL(z3, SET1(1.961570560f)), z5); \ z4 = SUB(MUL(z4, SET1(0.390180644f)), z5); \ M2(7, SUB(t4, ADD(z1, z3))) M2(5, SUB(t5, ADD(z2, z4))) \ M2(3, SUB(t6, ADD(z2, z3))) M2(1, SUB(t7, ADD(z1, z4))) #if 1 && defined(USE_NEON) float32x4_t *ws, buf[DCTSIZE2] ALIGN(16); int i; float32x4_t t0, t1, t2, t3, t4, t5, t6, t7, z1, z2, z3, z4, z5; #define ADD vaddq_f32 #define SUB vsubq_f32 #define MUL vmulq_f32 #define SET1 vdupq_n_f32 ws = buf; for (i = 0; i < DCTSIZE; i += 4, in += 4, ws += 4) 
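	/* fdct first pass (NEON path): 1-D transform along the columns, four columns per iteration, results written to the workspace buffer */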
{ #define M1(i) vld1q_f32(in + i * DCTSIZE) #define M2(i, t) ws[(i & 3) + (i & 4) * 2] = t; M3 #undef M1 #undef M2 } ws = buf; for (i = 0; i < DCTSIZE; i += 4, ws += 8, out += 4 * DCTSIZE) { float32x4_t c0 = SET1(0.125f); float32x4x4_t q0 = vld4q_f32((float*)&ws[0]), q1 = vld4q_f32((float*)&ws[4]); #define M1(i, n) float32x4_t x##i = q##n.val[i & 3]; M1(0, 0) M1(1, 0) M1(2, 0) M1(3, 0) M1(4, 1) M1(5, 1) M1(6, 1) M1(7, 1) #undef M1 #define M1(i) x##i #define M2(i, t) x##i = vmulq_f32(t, c0); M3 #undef M1 #undef M2 { float32x4x2_t p0, p1, p2; #define M1(i, j) p2 = vzipq_f32(p0.val[i], p1.val[i]); \ vst1q_f32(out + (i * 2) * DCTSIZE + j, p2.val[0]); \ vst1q_f32(out + (i * 2 + 1) * DCTSIZE + j, p2.val[1]); p0 = vzipq_f32(x0, x2); p1 = vzipq_f32(x1, x3); M1(0, 0) M1(1, 0) p0 = vzipq_f32(x4, x6); p1 = vzipq_f32(x5, x7); M1(0, 4) M1(1, 4) #undef M1 } } #elif 1 && defined(USE_AVX2) __m256 v0, v1, v2, v3, v4, v5, v6, v7, x0, x1, x2, x3, x4, x5, x6, x7; __m256 t0, t1, t2, t3, t4, t5, t6, t7, z1, z2, z3, z4, z5; #define ADD _mm256_add_ps #define SUB _mm256_sub_ps #define MUL _mm256_mul_ps #define SET1 _mm256_set1_ps #define M1(i) _mm256_loadu_ps(in + i * DCTSIZE) #define M2(i, t) x##i = t; M3 #undef M1 #undef M2 #define M5(v0, v1, v2, v3, k) \ v0 = _mm256_permute2f128_ps(x0, x4, k); \ v1 = _mm256_permute2f128_ps(x1, x5, k); \ v2 = _mm256_permute2f128_ps(x2, x6, k); \ v3 = _mm256_permute2f128_ps(x3, x7, k); #define M4(v0, v1, v2, v3, x0, x1, x2, x3) \ t0 = _mm256_unpacklo_ps(v0, v2); \ t1 = _mm256_unpacklo_ps(v1, v3); \ x0 = _mm256_unpacklo_ps(t0, t1); \ x1 = _mm256_unpackhi_ps(t0, t1); \ t0 = _mm256_unpackhi_ps(v0, v2); \ t1 = _mm256_unpackhi_ps(v1, v3); \ x2 = _mm256_unpacklo_ps(t0, t1); \ x3 = _mm256_unpackhi_ps(t0, t1); M5(v0, v1, v2, v3, 0x20) M5(v4, v5, v6, v7, 0x31) M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #define M1(i) x##i #define M2(i, t) x##i = MUL(t, SET1(0.125f)); M3 #undef M1 #undef M2 M5(v0, v1, v2, v3, 0x20) M5(v4, v5, v6, v7, 0x31) M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #undef M5 #undef M4 #define M1(i) _mm256_storeu_ps(out + i * DCTSIZE, x##i); M1(0) M1(1) M1(2) M1(3) M1(4) M1(5) M1(6) M1(7) #undef M1 #elif 1 && defined(USE_SSE2) __m128 *ws, buf[DCTSIZE2] ALIGN(16); int i; __m128 v0, v1, v2, v3, v4, v5, v6, v7, x0, x1, x2, x3, x4, x5, x6, x7; __m128 t0, t1, t2, t3, t4, t5, t6, t7, z1, z2, z3, z4, z5; #define ADD _mm_add_ps #define SUB _mm_sub_ps #define MUL _mm_mul_ps #define SET1 _mm_set1_ps ws = buf; for (i = 0; i < DCTSIZE; i += 4, in += 4, ws += 4) { #define M1(i) _mm_loadu_ps(in + i * DCTSIZE) #define M2(i, t) ws[(i & 3) + (i & 4) * 2] = t; M3 #undef M1 #undef M2 } ws = buf; for (i = 0; i < DCTSIZE; i += 4, ws += 8, out += 4 * DCTSIZE) { __m128 c0 = SET1(0.125f); #define M1(i) v##i = ws[i]; M1(0) M1(1) M1(2) M1(3) M1(4) M1(5) M1(6) M1(7) #undef M1 #define M4(v0, v1, v2, v3, x0, x1, x2, x3) \ t0 = _mm_unpacklo_ps(v0, v2); \ t1 = _mm_unpacklo_ps(v1, v3); \ x0 = _mm_unpacklo_ps(t0, t1); \ x1 = _mm_unpackhi_ps(t0, t1); \ t0 = _mm_unpackhi_ps(v0, v2); \ t1 = _mm_unpackhi_ps(v1, v3); \ x2 = _mm_unpacklo_ps(t0, t1); \ x3 = _mm_unpackhi_ps(t0, t1); M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #define M1(i) x##i #define M2(i, t) v##i = MUL(t, c0); M3 #undef M1 #undef M2 M4(v0, v1, v2, v3, x0, x1, x2, x3) M4(v4, v5, v6, v7, x4, x5, x6, x7) #undef M4 #define M1(i) _mm_storeu_ps(out + (i & 3) * DCTSIZE + (i & 4), x##i); M1(0) M1(1) M1(2) M1(3) M1(4) M1(5) M1(6) M1(7) #undef M1 } #else float *ws, 
buf[DCTSIZE2]; int i; float t0, t1, t2, t3, t4, t5, t6, t7, z1, z2, z3, z4, z5; #define ADD(a, b) ((a) + (b)) #define SUB(a, b) ((a) - (b)) #define MUL(a, b) ((a) * (b)) #define SET1(a) (a) #define M1(i) in[i * DCTSIZE] #define M2(i, t) ws[i * DCTSIZE] = t; ws = buf; for (i = 0; i < DCTSIZE; i++, in++, ws++) { M3 } #undef M1 #undef M2 #define M1(i) ws[i] #define M2(i, t) out[i] = (t) * 0.125f; ws = buf; for (i = 0; i < DCTSIZE; i++, ws += DCTSIZE, out += DCTSIZE) { M3 } #undef M1 #undef M2 #endif #undef M3 #undef ADD #undef SUB #undef MUL #undef SET1 } jpeg-quantsmooth-1.20210408/irfanview/000077500000000000000000000000001403361770700174225ustar00rootroot00000000000000jpeg-quantsmooth-1.20210408/irfanview/Makefile000066400000000000000000000005611403361770700210640ustar00rootroot00000000000000 LIBNAME ?= jpegqs.dll SONAME ?= JPEGQS.DLL MTOPTS ?= -fopenmp CFLAGS := -O2 -Wall -Wextra -pedantic -municode -I.. $(MTOPTS) JPEGLIB ?= -ljpeg -static .PHONY: all clean all: $(LIBNAME) clean: rm -f $(LIBNAME) $(LIBNAME): plugin.c $(CC) $(CFLAGS) -shared -Wl,-soname,$(SONAME) -o $@ $< $(JPEGLIB) -Wl,--version-script,plugin-vers.txt -Wl,--gc-sections strip $@ jpeg-quantsmooth-1.20210408/irfanview/build.sh000066400000000000000000000006131403361770700210550ustar00rootroot00000000000000#!/bin/sh jpeg=${1:-"jpeg-6b"} bits=${2:-""} lib="-ljpeg -static" [ -d $jpeg ] && lib="-I$jpeg $jpeg/libjpeg.a -static" name="JpegQS.dll" [ "$bits" ] && { mkdir -p x"$bits" name="x"$bits"/$name" } test -d ../winlib$bits && lib="$lib -L../winlib$bits" test -f ../ldscript$bits.txt && lib="$lib -Wl,-T,../ldscript$bits.txt" make JPEGLIB="../libjpegqs${bits}.a $lib" LIBNAME="$name" clean all jpeg-quantsmooth-1.20210408/irfanview/plugin-vers.txt000066400000000000000000000001061403361770700224330ustar00rootroot00000000000000/* default */ { global: GetPlugInInfo; LoadJPG; local: *; }; jpeg-quantsmooth-1.20210408/irfanview/plugin.c000066400000000000000000000143441403361770700210720ustar00rootroot00000000000000/* * Copyright (C) 2020 Ilya Kurdyukov * * JPEG reader plugin for IrfanView */ #include #include #include #include #include #include "jpeglib.h" #define WIN32_LEAN_AND_MEAN // conflict with libjpeg typedef #define INT32 INT32_WIN #include #ifdef _OPENMP #include #endif #include "libjpegqs.h" typedef BITMAPINFOHEADER bitmap_t; static bitmap_t *bitmap_create(int width, int height, int bpp) { bitmap_t *bm; // BMP needs 4-byte row alignment int stride = (width * bpp + 3) & -4; uint64_t size = (int64_t)stride * height + sizeof(bitmap_t) + (bpp == 1 ? 
256 * 4 : 0); // check for overflow if ((unsigned)((width - 1) | (height - 1)) >= 0x10000 || (uint64_t)(SIZE_T)size != size) return NULL; bm = (bitmap_t*)GlobalAlloc(GMEM_FIXED, size); if (!bm) return bm; memset(bm, 0, sizeof(bitmap_t)); bm->biSize = sizeof(bitmap_t); bm->biWidth = width; bm->biHeight = height; bm->biPlanes = 1; bm->biBitCount = bpp * 8; bm->biSizeImage = height * stride; if (bpp == 1) { int i; int32_t *p = (int32_t*)(bm + 1); bm->biClrUsed = 256; for (i = 0; i < 256; i++) p[i] = i * 0x010101; } return bm; } static void bitmap_free(bitmap_t *in) { if (in) GlobalFree(in); } typedef struct { struct jpeg_error_mgr pub; wchar_t *errbuf; jmp_buf setjmp_buffer; } bitmap_jpeg_err_ctx; static inline void copyMsg(wchar_t *errbuf, const char *msg) { int i = 0; uint8_t a; do { errbuf[i] = a = msg[i]; i++; } while (a); } static void bitmap_jpeg_err(j_common_ptr cinfo) { char errorMsg[JMSG_LENGTH_MAX]; bitmap_jpeg_err_ctx* jerr = (bitmap_jpeg_err_ctx*)cinfo->err; (*(cinfo->err->format_message))(cinfo, errorMsg); copyMsg(jerr->errbuf, errorMsg); longjmp(jerr->setjmp_buffer, 1); } static bitmap_t* bitmap_read_jpeg(const wchar_t *filename, jpegqs_control_t *opts, wchar_t *errbuf, int grayscale) { struct jpeg_decompress_struct ci; FILE * volatile fp; int volatile ok = 0; bitmap_t * volatile bm = NULL; void * volatile mem = NULL; bitmap_jpeg_err_ctx jerr; ci.err = jpeg_std_error(&jerr.pub); jerr.errbuf = errbuf; jerr.pub.error_exit = bitmap_jpeg_err; if (!setjmp(jerr.setjmp_buffer)) do { unsigned x, y, w, h, st; uint8_t *data; bitmap_t *bm1; JSAMPROW *scanline; int bpp; jpeg_create_decompress(&ci); if (!(fp = _wfopen(filename, L"rb"))) { copyMsg(errbuf, "Error opening file for reading"); break; } jpeg_stdio_src(&ci, fp); jpeg_read_header(&ci, TRUE); bpp = grayscale || ci.num_components == 1 ? 1 : 3; ci.out_color_space = bpp == 1 ? JCS_GRAYSCALE : JCS_RGB; jpegqs_start_decompress(&ci, opts); w = ci.output_width; h = ci.output_height; bm = bm1 = bitmap_create(w, h, bpp); if (bm1) mem = scanline = (JSAMPROW*)malloc(h * sizeof(JSAMPROW)); if (!bm1 || !scanline) { copyMsg(errbuf, "Memory allocation failed"); break; } st = (w * bpp + 3) & -4; data = (uint8_t*)(bm1 + 1); if (bpp == 1) data += 256 * 4; // BMP uses reverse row order for (y = 0; y < h; y++) scanline[y] = (JSAMPLE*)(data + (h - 1 - y) * st); while ((y = ci.output_scanline) < h) jpeg_read_scanlines(&ci, scanline + y, h - y); if (bpp == 1) for (y = 0; y < h; y++) for (x = w; x < st; x++) data[y * st + x] = 0; else for (y = 0; y < h; y++) { JSAMPLE *p = data + y * st, t; // need to convert RGB to BGR for BMP for (x = 0; x < w * 3; x += 3) { t = p[x]; p[x] = p[x + 2]; p[x + 2] = t; } for (; x < st; x++) p[x] = 0; } ok = 1; jpegqs_finish_decompress(&ci); } while (0); if (mem) free(mem); if (!ok) { bitmap_free(bm); bm = NULL; } jpeg_destroy_decompress(&ci); if (fp) fclose(fp); return bm; } // Do only one processing at a time (need for MiniOMP). #define NEED_LOCK // Do processing in a separate thread. // Need when linking with libgomp from Mingw-w64, otherwise unstable. 
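// With only NEED_LOCK defined, LoadJPG simply wraps bitmap_read_jpeg() in the
// critical section; uncommenting SEPARATE_THREAD below additionally moves the
// decode into its own worker thread (see LoadJPGThread further down).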
//#define SEPARATE_THREAD static HINSTANCE hInst = NULL; #ifdef NEED_LOCK static CRITICAL_SECTION CriticalSection; #define LOCK EnterCriticalSection(&CriticalSection); #define UNLOCK LeaveCriticalSection(&CriticalSection); #else #define LOCK #define UNLOCK #endif BOOL WINAPI DllMain(HINSTANCE hinstDLL, ULONG fdwReason, LPVOID lpvReserved) { (void)lpvReserved; if (fdwReason == DLL_PROCESS_ATTACH) { hInst = hinstDLL; #ifdef NEED_LOCK InitializeCriticalSection(&CriticalSection); #endif } else if (fdwReason == DLL_PROCESS_DETACH) { #ifdef NEED_LOCK DeleteCriticalSection(&CriticalSection); #endif } return TRUE; } #if defined(_MSC_VER) #define EXPORT __declspec(dllexport) #else #define EXPORT __attribute__((visibility("default"))) #endif static jpegqs_control_t jpegqs_default = { JPEGQS_DIAGONALS | JPEGQS_JOINT_YUV | JPEGQS_UPSAMPLE_UV, 3, -1, 0, NULL, NULL }; // threads < 0 : doesn't changes current number // threads = 0 : use all available cpu threads #ifdef SEPARATE_THREAD #define LoadJPG LoadJPG_orig static #else EXPORT #endif HANDLE LoadJPG(const wchar_t *filename, jpegqs_control_t *opts, wchar_t *errbuf, int grayscale, void *dummy) { bitmap_t *bm; (void)dummy; jpegqs_control_t opts2; if (!opts) opts = &jpegqs_default; memcpy(&opts2, opts, sizeof(opts2)); opts2.flags &= ~JPEGQS_TRANSCODE; if (grayscale) opts2.flags &= ~(JPEGQS_JOINT_YUV | JPEGQS_UPSAMPLE_UV); #ifndef SEPARATE_THREAD LOCK #endif bm = bitmap_read_jpeg(filename, &opts2, errbuf, grayscale); #ifndef SEPARATE_THREAD UNLOCK #endif return (HANDLE)bm; } #ifdef SEPARATE_THREAD #undef LoadJPG typedef struct { const wchar_t *filename; jpegqs_control_t *opts; wchar_t *errbuf; int grayscale; void *dummy; HANDLE result; } LoadJPGParams; static DWORD WINAPI LoadJPGThread(LPVOID lpParam) { LoadJPGParams *p = (LoadJPGParams*)lpParam; p->result = LoadJPG_orig(p->filename, p->opts, p->errbuf, p->grayscale, p->dummy); return 0; } EXPORT HANDLE LoadJPG(const wchar_t *filename, jpegqs_control_t *opts, wchar_t *errbuf, int grayscale, void *dummy) { LoadJPGParams p = { filename, opts, errbuf, grayscale, dummy, (HANDLE)NULL }; DWORD threadId; HANDLE thread; LOCK thread = CreateThread(NULL, 0, LoadJPGThread, (LPVOID)&p, 0, &threadId); WaitForSingleObject(thread, INFINITE); CloseHandle(thread); UNLOCK return p.result; } #endif EXPORT int GetPlugInInfo(char *version, char *name) { sprintf(version, JPEGQS_VERSION); sprintf(name, "JPEG Quant Smooth"); return 0; } jpeg-quantsmooth-1.20210408/jconfig.h000066400000000000000000000004501403361770700172170ustar00rootroot00000000000000#define NO_GETENV #ifndef XMD_H #define XMD_H #include typedef int16_t INT16; typedef int32_t INT32; #endif #define HAVE_PROTOTYPES #define HAVE_UNSIGNED_CHAR #define HAVE_UNSIGNED_SHORT #define HAVE_STDDEF_H #define HAVE_STDLIB_H #ifdef JPEG_INTERNALS #define INLINE inline #endif jpeg-quantsmooth-1.20210408/jpegqs-mini.c000066400000000000000000000446731403361770700200350ustar00rootroot00000000000000/* * Copyright (C) 2016-2020 Ilya Kurdyukov * * This file is part of jpeg quantsmooth (mini version) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ /* * A minimal JPEG Quant Smooth version for experiments. * Without SIMD optimizations and own DCT transforms, but has OpenMP pragmas. * Uses DCT transforms from libjpeg. * * Build with: make SRCNAME=jpegqs-mini.c */ #define JPEG_INTERNALS #ifdef WITH_JPEGSRC #include "jinclude.h" #include "jpeglib.h" #include "jdct.h" #else #include #include #include #include "jpeglib.h" #define NO_JPEGTRAN #ifndef WITH_SIMD #define DCTELEM int #else #define DCTELEM short #endif EXTERN(void) jpeg_idct_islow(j_decompress_ptr, jpeg_component_info*, JCOEFPTR, JSAMPARRAY, JDIMENSION); EXTERN(void) jpeg_fdct_islow(DCTELEM*); #define MEMCOPY memcpy #endif #include // X = cos(pi * n / 16) * sqrt(2) #define A 1.000000000 // 4 #define B 1.387039845 // 1 #define C 1.175875602 // 3 #define D 0.785694958 // 5 #define E 0.275899379 // 7 #define F 1.306562965 // 2 #define G 0.541196100 // 6 static const float idct_fcoef[DCTSIZE2] = { A, A, A, A, A, A, A, A, B, C, D, E, -E, -D, -C, -B, F, G, -G, -F, -F, -G, G, F, C, -E, -B, -D, D, B, E, -C, A, -A, -A, A, A, -A, -A, A, D, -B, E, C, -C, -E, B, -D, G, -F, F, -G, -G, F, -F, G, E, -D, C, -B, B, -C, D, -E }; #undef A #undef B #undef C #undef D #undef E #undef F #undef G #define IDCT_ISLOW(col) jpeg_idct_islow(srcinfo, compptr, coef, output_buf, col) static float dct_diff[DCTSIZE2][DCTSIZE2]; static const char zigzag_refresh[DCTSIZE2] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1 }; #define printf(...) fprintf(stderr, __VA_ARGS__) #define NUM_ITER 3 #define DIAGONALS #define JOINT_YUV #define UPSAMPLE_UV //#define LOW_QUALITY //#define NO_REBALANCE //#define NO_REBALANCE_UV static void quantsmooth_block(j_decompress_ptr srcinfo, jpeg_component_info *compptr, JCOEFPTR coef, UINT16 *quantval, JSAMPLE *image, JSAMPLE *image2, int stride, int luma) { int k, x, y, need_refresh = 1, n = DCTSIZE; JSAMPLE buf[DCTSIZE2], *output_buf[DCTSIZE]; #ifdef DIAGONALS float bcoef = 4.0; #else float bcoef = 2.0; #endif for (k = 0; k < n; k++) output_buf[k] = buf + k * n; if (image2) { #ifdef JOINT_YUV DCTELEM buf[DCTSIZE2]; for (y = 0; y < n; y++) for (x = 0; x < n; x++) { float sumA = 0, sumB = 0, sumAA = 0, sumAB = 0; float divN = 1.0f / 16, scale, offset; int a; #define M1(xx, yy) { \ float a = image2[(y + yy) * stride + x + xx]; \ float b = image[(y + yy) * stride + x + xx]; \ sumA += a; sumAA += a * a; \ sumB += b; sumAB += a * b; } #define M2(n) sumA *= n; sumB *= n; sumAA *= n; sumAB *= n; M1(0, 0) M2(2) M1(0, -1) M1(-1, 0) M1(1, 0) M1(0, 1) M2(2) M1(-1, -1) M1(1, -1) M1(-1, 1) M1(1, 1) #undef M2 #undef M1 scale = sumAA - sumA * divN * sumA; if (scale != 0.0f) scale = (sumAB - sumA * divN * sumB) / scale; scale = fminf(fmaxf(scale, -16.0f), 16.0f); offset = (sumB - scale * sumA) * divN; a = image2[y * stride + x] * scale + offset + 0.5f; a = a < 0 ? 0 : a > MAXJSAMPLE ? MAXJSAMPLE : a; buf[y * n + x] = a - CENTERJSAMPLE; } jpeg_fdct_islow(buf); for (x = 0; x < n * n; x++) { int div = quantval[x], coef1 = coef[x], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? 
d1 : d0); add = (buf[x] + 4) >> 3; if (add > dh) add = dh; if (add < dl) add = dl; coef[x] = add; } #endif } #ifdef LOW_QUALITY (void)bcoef; (void)output_buf; (void)need_refresh; (void)srcinfo; (void)compptr; (void)zigzag_refresh; if (!image2) { DCTELEM buf[DCTSIZE2]; float range = 0, c0 = 2, c1 = c0 * sqrtf(0.5f); { int sum = 0; for (x = 1; x < n * n; x++) { int a = coef[x]; a = a < 0 ? -a : a; range += quantval[x] * a; sum += a; } if (sum) range *= 4.0f / sum; if (range > CENTERJSAMPLE) range = CENTERJSAMPLE; } for (y = 0; y < n; y++) for (x = 0; x < n; x++) { #define CORE(i, x, y) t0 = a - image[(y) * stride + (x)]; \ t = range - fabsf(t0); if (t < 0) t = 0; t *= t; aw = c##i * t; \ a0 += t0 * t * aw; an += aw * aw; int a = image[(y)*stride+(x)]; float a0 = 0, an = 0, aw, t, t0; CORE(1, x-1, y-1) CORE(0, x, y-1) CORE(1, x+1, y-1) CORE(0, x-1, y) CORE(0, x+1, y) CORE(1, x-1, y+1) CORE(0, x, y+1) CORE(1, x+1, y+1) #undef CORE if (an != 0.0f) a -= (int)roundf(a0 / an); buf[y * n + x] = a - CENTERJSAMPLE; } jpeg_fdct_islow(buf); for (x = 0; x < n * n; x++) { int div = quantval[x], coef1 = coef[x], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? d1 : d0); add = (buf[x] + 4) >> 3; if (add > dh) add = dh; if (add < dl) add = dl; coef[x] = add; } } #else for (k = DCTSIZE2-1; k > 0; k--) { int p0, p1, i = jpeg_natural_order[k]; float *tab = dct_diff[i], a0, a1, t, a2 = 0, a3 = 0; int range = quantval[i] * 2; if (need_refresh && zigzag_refresh[i]) { IDCT_ISLOW(0); need_refresh = 0; } #define CORE t = (float)range - fabsf(a0); \ if (t < 0) t = 0; t *= t; a0 *= t; a1 *= t; a2 += a0 * a1; a3 += a1 * a1; #define M1(a, b) \ for (y = 0; y < n - 1 + a; y++) \ for (x = 0; x < n - 1 + b; x++) { \ p0 = y * n + x; p1 = (y + b) * n + x + a; \ a0 = buf[p0] - buf[p1]; a1 = tab[p0] - tab[p1]; CORE } #define M2(i, a, b) for (i = 0; i < n; i++) { p0 = y * n + x; \ a0 = buf[p0] - image[(b) * stride + a]; a1 = tab[p0] * bcoef; CORE } if (i & (n - 1)) M1(1, 0) y = 0; M2(x, x, y - 1) y = n - 1; M2(x, x, y + 1) x = 0; M2(y, x - 1, y) x = n - 1; M2(y, x + 1, y) if (i > (n - 1)) M1(0, 1) #ifdef DIAGONALS for (y = 0; y < 7; y++) for (x = 0; x < 7; x++) { p0 = y * n + x; p1 = (y + 1) * n + x; a0 = buf[p0] - buf[p1+1]; a1 = tab[p0] - tab[p1+1]; CORE a0 = buf[p0+1] - buf[p1]; a1 = tab[p0+1] - tab[p1]; CORE } #endif #undef M2 #undef M1 #undef CORE a2 = a2 / a3; range = roundf(a2); if (range) { int div = quantval[i], coef1 = coef[i], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? d1 : d0); add = coef1 - range; if (add > dh) add = dh; if (add < dl) add = dl; coef[i] = add; need_refresh |= add ^ coef1; } } #endif (void)luma; #ifndef NO_REBALANCE #ifdef NO_REBALANCE_UV if (luma) #endif { JCOEF buf[DCTSIZE2]; int64_t m0 = 0, m1 = 0; for (k = 1; k < DCTSIZE2; k++) { int div = quantval[k], coef1 = coef[k], d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; buf[k] = a0; m0 += coef1 * a0; m1 += a0 * a0; } if (m1 > m0) { int mul = ((m1 << 13) + (m0 >> 1)) / m0; for (k = 1; k < DCTSIZE2; k++) { int div = quantval[k], coef1 = coef[k], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = buf[k]; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? 
d1 : d0); add = (coef1 * mul + 0x1000) >> 13; if (add > dh) add = dh; if (add < dl) add = dl; coef[k] = add; } } } #endif } static void do_quantsmooth(j_decompress_ptr srcinfo, jvirt_barray_ptr *src_coef_arrays) { JDIMENSION comp_width, comp_height, blk_y; int i, ci, stride, iter, stride1 = 0, need_downsample = 0; jpeg_component_info *compptr; JQUANT_TBL *qtbl; JSAMPLE *image, *image1 = NULL, *image2 = NULL; JSAMPLE *output_buf[DCTSIZE], range_limit[11 * CENTERJSAMPLE]; MULTIPLIER dct_table1[DCTSIZE2]; (void)stride1; for (i = 0; i < DCTSIZE2; i++) dct_table1[i] = 1; /* // You need this code before calling jpeg_read_coefficients() // just to initialize the range limit table for jpeg_idct_islow() srcinfo.buffered_image = TRUE; jpeg_start_decompress(&srcinfo); while (!jpeg_input_complete(&srcinfo)) { jpeg_start_output(&srcinfo, srcinfo.input_scan_number); jpeg_finish_output(&srcinfo); } // Or you can fill out the table by yourself. */ { int c = CENTERJSAMPLE, m = c * 2; JSAMPLE *t = range_limit; for (i = 0; i < m; i++) t[i] = 0; t += m; srcinfo->sample_range_limit = t; for (i = 0; i < m; i++) t[i] = i; for (; i < 2 * m + c; i++) t[i] = m - 1; for (; i < 4 * m; i++) t[i] = 0; for (i = 0; i < c; i++) t[4 * m + i] = i; } #ifndef LOW_QUALITY for (i = 0; i < DCTSIZE2; i++) { int x, y; float m, *tab = dct_diff[i]; for (y = 0; y < DCTSIZE; y++) { m = idct_fcoef[i / DCTSIZE * DCTSIZE + y] * (1.0f / DCTSIZE); for (x = 0; x < DCTSIZE; x++) tab[y * DCTSIZE + x] = m * idct_fcoef[i % DCTSIZE * DCTSIZE + x]; } } #else (void)dct_diff; (void)idct_fcoef; #endif #if defined(JOINT_YUV) || defined(UPSAMPLE_UV) compptr = srcinfo->comp_info; if (srcinfo->jpeg_color_space == JCS_YCbCr && !((compptr[1].h_samp_factor - 1) | (compptr[1].v_samp_factor - 1) | (compptr[2].h_samp_factor - 1) | (compptr[2].v_samp_factor - 1))) { need_downsample = 1; } #endif for (ci = 0; ci < srcinfo->num_components; ci++) { int extra_refresh = 0; compptr = srcinfo->comp_info + ci; comp_width = compptr->width_in_blocks; comp_height = compptr->height_in_blocks; if (!(qtbl = compptr->quant_table)) continue; stride = comp_width * DCTSIZE + 2; image = (JSAMPLE*)malloc((comp_height * DCTSIZE + 2) * stride * sizeof(JSAMPLE)); if (!image) continue; #define IMAGEPTR (blk_y * DCTSIZE + 1) * stride + blk_x * DCTSIZE + 1 compptr->dct_table = dct_table1; for (i = 0; i < DCTSIZE; i++) output_buf[i] = image + i * stride; if (image1 || (!ci && need_downsample)) extra_refresh = 1; for (iter = 0; iter < NUM_ITER + extra_refresh; iter++) { #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y, 1, TRUE); for (blk_x = 0; blk_x < comp_width; blk_x++) { JCOEFPTR coef = buffer[0][blk_x]; int i; if (!iter) for (i = 0; i < DCTSIZE2; i++) coef[i] *= qtbl->quantval[i]; IDCT_ISLOW(IMAGEPTR); } } { int y, w = comp_width * DCTSIZE, h = comp_height * DCTSIZE; for (y = 1; y < h + 1; y++) { image[y * stride] = image[y * stride + 1]; image[y * stride + w + 1] = image[y * stride + w]; } MEMCOPY(image, image + stride, stride * sizeof(JSAMPLE)); MEMCOPY(image + (h + 1) * stride, image + h * stride, stride * sizeof(JSAMPLE)); } if (iter == NUM_ITER) break; #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y, 1, 
TRUE); for (blk_x = 0; blk_x < comp_width; blk_x++) quantsmooth_block(srcinfo, compptr, buffer[0][blk_x], qtbl->quantval, image + IMAGEPTR, image2 ? image2 + IMAGEPTR : NULL, stride, !ci || srcinfo->jpeg_color_space != JCS_YCbCr); } } // iter #ifdef UPSAMPLE_UV if (image1) { JSAMPLE *mem; int st, w1, h1, ws, hs; compptr = srcinfo->comp_info; ws = compptr[0].h_samp_factor; hs = compptr[0].v_samp_factor; w1 = (srcinfo->image_width + ws - 1) / ws; h1 = (srcinfo->image_height + hs - 1) / hs; comp_width = compptr[0].width_in_blocks; comp_height = compptr[0].height_in_blocks; src_coef_arrays[ci] = (*srcinfo->mem->request_virt_barray) ((j_common_ptr)srcinfo, JPOOL_IMAGE, FALSE, comp_width, comp_height, 1); (*srcinfo->mem->realize_virt_arrays) ((j_common_ptr)srcinfo); #ifdef _OPENMP // need to suppress JERR_BAD_VIRTUAL_ACCESS for (blk_y = 0; blk_y < comp_height; blk_y++) { (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y, 1, TRUE); } #endif st = comp_width * DCTSIZE; mem = (JSAMPLE*)malloc(comp_height * DCTSIZE * st * sizeof(JSAMPLE)); if (mem) { int y, ww = comp_width * DCTSIZE, hh = comp_height * DCTSIZE; #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (y = 0; y < h1; y++) { int x, xx, yy, a, h2 = hh - y * hs; h2 = h2 < hs ? h2 : hs; for (x = 0; x < w1; x++) { JSAMPLE *p = mem + y * hs * st + x * ws; JSAMPLE *p1 = image1 + (y * hs + 1) * stride1 + x * ws + 1; int w2 = ww - x * ws; w2 = w2 < ws ? w2 : ws; float sumA = 0, sumB = 0, sumAA = 0, sumAB = 0; float divN = 1.0f / 16, scale, offset; int a; #define M1(xx, yy) { \ float a = image2[(y + yy + 1) * stride + x + xx + 1]; \ float b = image[(y + yy + 1) * stride + x + xx + 1]; \ sumA += a; sumAA += a * a; \ sumB += b; sumAB += a * b; } #define M2(n) sumA *= n; sumB *= n; sumAA *= n; sumAB *= n; M1(0, 0) M2(2) M1(0, -1) M1(-1, 0) M1(1, 0) M1(0, 1) M2(2) M1(-1, -1) M1(1, -1) M1(-1, 1) M1(1, 1) #undef M2 #undef M1 scale = sumAA - sumA * divN * sumA; if (scale != 0.0f) scale = (sumAB - sumA * divN * sumB) / scale; scale = fminf(fmaxf(scale, -16.0f), 16.0f); // offset = (sumB - scale * sumA) * divN; a = image2[(y + 1) * stride + x + 1]; offset = image[(y + 1) * stride + x + 1] - a * scale; for (yy = 0; yy < h2; yy++) for (xx = 0; xx < w2; xx++) { a = p1[yy * stride1 + xx] * scale + offset + 0.5f; p[yy * st + xx] = a < 0 ? 0 : a > MAXJSAMPLE ? 
MAXJSAMPLE : a; } } a = mem[y * st + w1 * ws - 1]; for (x = w1 * ws; x < ww; x++) mem[y * st + x] = a; } for (y = h1 * hs; y < hh; y++) MEMCOPY(mem + y * st, mem + (h1 * hs - 1) * st, st * sizeof(JSAMPLE)); #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y, 1, TRUE); for (blk_x = 0; blk_x < comp_width; blk_x++) { DCTELEM buf[DCTSIZE2]; int x, y, n = DCTSIZE; JSAMPLE *p = mem + blk_y * n * st + blk_x * n; JCOEFPTR coef = buffer[0][blk_x]; for (y = 0; y < n; y++) for (x = 0; x < n; x++) buf[y * n + x] = p[y * st + x] - CENTERJSAMPLE; jpeg_fdct_islow(buf); for (x = 0; x < n * n; x++) coef[x] = (buf[x] + 4) >> 3; } } free(mem); } } else #endif #if defined(JOINT_YUV) || defined(UPSAMPLE_UV) if (!ci && need_downsample) do { // make downsampled copy of Y component int y, w, h, w1, h1, st, ws, hs; ws = compptr[0].h_samp_factor; hs = compptr[0].v_samp_factor; if ((ws - 1) | (hs - 1)) { #ifdef UPSAMPLE_UV image1 = image; stride1 = stride; #endif } else { image2 = image; break; } w = compptr[1].width_in_blocks * DCTSIZE; h = compptr[1].height_in_blocks * DCTSIZE; st = w + 2; image2 = (JSAMPLE*)malloc((h + 2) * st * sizeof(JSAMPLE)); if (!image2) break; w1 = (comp_width * DCTSIZE + ws - 1) / ws; h1 = (comp_height * DCTSIZE + hs - 1) / hs; #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (y = 0; y < h1; y++) { int x, h2 = comp_height * DCTSIZE - y * hs; h2 = h2 < hs ? h2 : hs; for (x = 0; x < w1; x++) { JSAMPLE *p = image + (y * hs + 1) * stride + x * ws + 1; int xx, yy, sum = 0, w2 = comp_width * DCTSIZE - x * ws, div; w2 = w2 < ws ? w2 : ws; div = w2 * h2; for (yy = 0; yy < h2; yy++) for (xx = 0; xx < w2; xx++) sum += p[yy * stride + xx]; image2[(y + 1) * st + x + 1] = (sum + div / 2) / div; } } for (y = 1; y < h1 + 1; y++) { int x; JSAMPLE a = image2[y * st + w1]; image2[y * st] = image2[y * st + 1]; for (x = w1 + 1; x < w + 2; x++) image2[y * st + x] = a; } MEMCOPY(image2, image2 + st, st * sizeof(JSAMPLE)); for (y = h1 + 1; y < h + 2; y++) MEMCOPY(image2 + y * st, image2 + h1 * st, st * sizeof(JSAMPLE)); } while (0); #endif // JOINT_YUV || UPSAMPLE_UV #undef IMAGEPTR if (image != image1 && image != image2) free(image); } if (image2 != image1 && image2) free(image2); if (image1) { srcinfo->max_h_samp_factor = 1; srcinfo->max_v_samp_factor = 1; srcinfo->comp_info[0].h_samp_factor = 1; srcinfo->comp_info[0].v_samp_factor = 1; free(image1); } for (ci = 0; ci < NUM_QUANT_TBLS; ci++) { qtbl = srcinfo->quant_tbl_ptrs[ci]; if (qtbl) for (i = 0; i < DCTSIZE2; i++) qtbl->quantval[i] = 1; } for (ci = 0; ci < srcinfo->num_components; ci++) { qtbl = srcinfo->comp_info[ci].quant_table; if (qtbl) for (i = 0; i < DCTSIZE2; i++) qtbl->quantval[i] = 1; } } #ifndef NO_JPEGTRAN // Macro for inserting quantsmooth into jpegtran code. 
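/* Rough illustration of the hook (assuming jpegtran.c's usual statement `src_coef_arrays = jpeg_read_coefficients(&srcinfo);`): the macro below makes that call expand to jpeg_read_coefficients() immediately followed by do_quantsmooth(&srcinfo, src_coef_arrays), so the DCT coefficients are smoothed before jpegtran writes them out. */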
#define jpeg_read_coefficients(srcinfo) \ jpeg_read_coefficients(srcinfo); \ do_quantsmooth(srcinfo, src_coef_arrays) #include "jpegtran.c" #else #ifdef _WIN32 #define USE_SETMODE #endif #ifdef USE_SETMODE #include #include #endif // Usage: ./main [-optimize] < input.jpg > output.jpg int main(int argc, char **argv) { struct jpeg_decompress_struct srcinfo; struct jpeg_compress_struct dstinfo; struct jpeg_error_mgr jerr; jvirt_barray_ptr *src_coef_arrays; #ifdef USE_SETMODE setmode(fileno(stdin), O_BINARY); setmode(fileno(stdout), O_BINARY); #endif srcinfo.err = dstinfo.err = jpeg_std_error(&jerr); jpeg_create_decompress(&srcinfo); jpeg_create_compress(&dstinfo); jpeg_stdio_src(&srcinfo, stdin); jpeg_read_header(&srcinfo, TRUE); src_coef_arrays = jpeg_read_coefficients(&srcinfo); do_quantsmooth(&srcinfo, src_coef_arrays); jpeg_copy_critical_parameters(&srcinfo, &dstinfo); if (argc > 1 && !strcmp(argv[1], "-optimize")) dstinfo.optimize_coding = TRUE; jpeg_stdio_dest(&dstinfo, stdout); jpeg_write_coefficients(&dstinfo, src_coef_arrays); jpeg_finish_compress(&dstinfo); jpeg_destroy_compress(&dstinfo); jpeg_finish_decompress(&srcinfo); jpeg_destroy_decompress(&srcinfo); return jerr.num_warnings ? 2 : 0; } #endif jpeg-quantsmooth-1.20210408/jpegqs.1000066400000000000000000000040531403361770700170050ustar00rootroot00000000000000.TH jpegqs 1 "08 Apr 2021" "JPEG Quant Smooth" .SH NAME .B JPEG Quant Smooth \- JPEG artifacts removal .SH DESCRIPTION This program tries to recover the lost precision of DCT coefficients based on a quantization table from a JPEG image. The result is saved as a JPEG image with quantization set to 1 (like a JPEG saved at 100% quality). .SH SYNOPSIS .B jpegqs .IR "" [ options ] .I input.jpg .I output.jpg .SH OPTIONS .TP .B \-q, \-\-quality n Quality setting (0\-6, default is 3) .TP .B \-n, \-\-niter n Number of iterations (default is 3) .IP * More iterations can make the result look like CG art and can make photos look unnatural. .TP .B \-t, \-\-threads n Set the number of CPU threads to use .TP .B \-o, \-\-optimize Option for libjpeg to produce smaller output file .TP .B \-v, \-\-verbose n Print libjpeg debug output .TP .B \-i, \-\-info n Print quantsmooth debug output (default is 15). .RS Use the sum of flags: .RS 0 \- silent .RE .RS 1/2/4 \- various information .RE .RS 8 \- processing time .RE .RS 16 \- SIMD type .RE .PP * The processing time includes only the smoothing algorithm; jpeg reading and writing time is not included. .RE .TP .B \-p, \-\-cpu n Use to lower the SIMD type if CPU detection fails: .RS .RS 0 \- auto .RE .RS 1 \- scalar .RE .RS 2 \- SSE2 .RE .RS 3 \- AVX2 .RE .RS 4 \- AVX512 .RE .PP * x86 build selects between modes 1\-3, x86_64 from 2\-4 .RE .SH QUALITY .TP The quality setting sets a combination of flags for processing: .TP 3. default .TP 4. adds DIAGONALS flag smoother diagonal edges, ~1.5 times slower .TP 5. adds JOINT_YUV flag chroma channels will depend on luminance, better color consistency .TP 6.
adds UPSAMPLE_UV flag non-blurring chroma upsampling, unlike fancy upsampling from libjpeg .TP levels 0\-2 is the same as 4\-6, but with LOW_QUALITY flag .RS .RE ~10 times faster, but the quality is lower .RS .RE LOW_QUALITY implies DIAGONALS (always set) .SH PROJECT PAGE .TP https://github.com/ilyakurdyukov/jpeg\-quantsmooth .SH COPYRIGHT .TP Copyright (C) 2016\-2021 Ilya Kurdyukov jpeg-quantsmooth-1.20210408/libjpegqs.c000066400000000000000000000064541403361770700175650ustar00rootroot00000000000000/* * Copyright (C) 2020 Ilya Kurdyukov * * This file is part of jpeg quantsmooth (library) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include #include #include #include #if !defined(TRANSCODE_ONLY) && defined(WITH_JPEGSRC) #define JPEG_INTERNALS #endif #include "jpeglib.h" #define logfmt(...) fprintf(stderr, __VA_ARGS__) #if defined(SIMD_SELECT) && !defined(SIMD_NAME) #ifndef JPEGQS_ATTR #define JPEGQS_ATTR #endif #include "libjpegqs.h" #define QS_ARGS (j_decompress_ptr srcinfo, jvirt_barray_ptr *coef_arrays, jpegqs_control_t *opts) #define M1(name) int do_quantsmooth_##name QS_ARGS; M1(base) M1(sse2) M1(avx2) M1(avx512) #undef M1 #ifdef _MSC_VER #include // void __cpuidex(int cpuInfo[4], int function_id, int subfunction_id); // unsigned __int64 _xgetbv(unsigned int); #define get_cpuid(a, c, out) __cpuidex(out, a, c) #define xgetbv(n) _xgetbv(n) #else static inline void get_cpuid(int32_t a, int32_t c, int32_t out[4]) { __asm__ ("cpuid\n" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(a), "c"(c)); } static inline int64_t xgetbv(int32_t n) { uint32_t eax, edx; __asm__ ("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(n)); return ((int64_t)edx << 32) | eax; } #endif JPEGQS_ATTR int do_quantsmooth QS_ARGS { int type = 1; do { int32_t cpuid[4], m, xcr0; get_cpuid(0, 0, cpuid); m = cpuid[0]; if (m < 1) break; get_cpuid(1, 0, cpuid); if (!(cpuid[3] & (1 << 25))) break; // SSE if (!(cpuid[3] & (1 << 26))) break; // SSE2 type = 2; // VirtualBox clears FMA, even if AVX2 is set // if (!(cpuid[2] & (1 << 12))) break; // FMA if (!(cpuid[2] & (1 << 27))) break; // OSXSAVE if (!(cpuid[2] & (1 << 28))) break; // AVX xcr0 = ~xgetbv(0); if (m < 7) break; get_cpuid(7, 0, cpuid); if (!(cpuid[1] & (1 << 5)) || xcr0 & 6) break; // AVX2 type = 3; if (!(cpuid[1] & (1 << 16)) || xcr0 & 0xe6) break; // AVX512F type = 4; } while (0); { int x = (opts->flags >> JPEGQS_CPU_SHIFT) & JPEGQS_CPU_MASK; if (x) type = x < type ? 
x : type; } #ifdef WITH_LOG if (opts->flags & JPEGQS_INFO_CPU) { logfmt("SIMD type: %i\n", type); } #endif #define M1(name) return do_quantsmooth_##name(srcinfo, coef_arrays, opts); #ifdef __x86_64__ if (type >= 4) M1(avx512) #endif if (type >= 3) M1(avx2) #ifndef __x86_64__ if (type >= 2) M1(sse2) #endif M1(base) #undef M1 } #else #ifdef SIMD_NAME #define QS_CONCAT(x) do_quantsmooth_##x #define QS_NAME1(x) QS_CONCAT(x) #define QS_NAME QS_NAME1(SIMD_NAME) #ifndef SIMD_BASE #define NO_HELPERS #endif #endif #define JPEGQS_ATTR #if !(defined(SIMD_SSE2) && defined(__x86_64__)) && \ !(defined(SIMD_AVX512) && !defined(__x86_64__)) #include "quantsmooth.h" #endif #endif jpeg-quantsmooth-1.20210408/libjpegqs.h000066400000000000000000000024661403361770700175710ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Ilya Kurdyukov * * JPEG Quant Smooth API definitions */ #ifndef JPEGQS_H #define JPEGQS_H #ifdef __cplusplus extern "C" { #endif enum { JPEGQS_ITER_MAX = 100, JPEGQS_DIAGONALS = 1, JPEGQS_JOINT_YUV = 2, JPEGQS_UPSAMPLE_UV = 4, JPEGQS_LOW_QUALITY = 8, JPEGQS_NO_REBALANCE = 16, JPEGQS_NO_REBALANCE_UV = 32, JPEGQS_TRANSCODE = 64, JPEGQS_FLAGS_MASK = 0x7f, JPEGQS_CPU_SHIFT = 12, JPEGQS_CPU_MASK = 15, JPEGQS_INFO_SHIFT = 16, JPEGQS_INFO_COMP1 = 1 << JPEGQS_INFO_SHIFT, JPEGQS_INFO_QUANT = 2 << JPEGQS_INFO_SHIFT, JPEGQS_INFO_COMP2 = 4 << JPEGQS_INFO_SHIFT, JPEGQS_INFO_TIME = 8 << JPEGQS_INFO_SHIFT, JPEGQS_INFO_CPU = 16 << JPEGQS_INFO_SHIFT }; #ifndef JPEGQS_ATTR #define JPEGQS_ATTR #endif #define JPEGQS_VERSION "1.20210408" #define JPEGQS_COPYRIGHT "Copyright (C) 2020-2021 Ilya Kurdyukov" typedef struct { int flags, niter, threads, progprec; void *userdata; int (*progress)(void *data, int cur, int max); } jpegqs_control_t; JPEGQS_ATTR int do_quantsmooth(j_decompress_ptr srcinfo, jvirt_barray_ptr *coef_arrays, jpegqs_control_t *opts); #ifndef TRANSCODE_ONLY JPEGQS_ATTR boolean jpegqs_start_decompress(j_decompress_ptr cinfo, jpegqs_control_t *opts); JPEGQS_ATTR boolean jpegqs_finish_decompress(j_decompress_ptr cinfo); #endif #ifdef __cplusplus } #endif #endif jpeg-quantsmooth-1.20210408/quantsmooth.c000066400000000000000000000422171403361770700201640ustar00rootroot00000000000000/* * Copyright (C) 2016-2020 Ilya Kurdyukov * * This file is part of jpeg quantsmooth * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #include #include #include #include #define STRINGIFY(s) #s #define TOSTRING(s) STRINGIFY(s) #ifdef __EMSCRIPTEN__ #include #else #define EMSCRIPTEN_KEEPALIVE #endif #if defined(WASM_MAIN) && !defined(WASM) #define WASM #endif #ifdef __cplusplus extern "C" { #endif #include "jpeglib.h" #ifdef WITH_JPEGSRC #include "jversion.h" #endif #ifdef __cplusplus } #endif #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN // conflict with libjpeg typedef #define INT32 INT32_WIN #include #define USE_SETMODE #endif #ifdef USE_SETMODE #include #include #endif #ifdef WASM #define logfmt(...) printf(__VA_ARGS__) #else #define logfmt(...) 
fprintf(stderr, __VA_ARGS__) #endif #define WITH_LOG #ifdef WASM_ASYNC EM_JS(void, js_progress, (int percent), { Module["wasm_progress"](percent); // Asyncify.handleSleep(function(wakeUp) { setTimeout(wakeUp, 0); }); }); #define PRECISE_PROGRESS #define PROGRESS_PTR wasm_progress static int wasm_progress(void *data, int cur, int max) { int percent = 100 * cur / max; js_progress(percent); emscripten_sleep(0); return 0; } #endif #define TRANSCODE_ONLY #ifdef SIMD_SELECT #define JPEGQS_ATTR static #include "libjpegqs.c" #else #include "quantsmooth.h" #endif #define CONCAT(a, b) a##b #ifdef UNICODE #define S(s) CONCAT(L, s) #define LS "%S" #else #define S(s) s #define LS "%s" #endif #ifndef TCHAR #ifdef UNICODE #define TCHAR wchar_t #else #define TCHAR char #endif #endif #ifdef WASM #define MEM_INPUT #define MEM_OUTPUT #endif #ifdef MEM_INPUT #if !defined(WASM) || defined(WASM_MAIN) static uint8_t* loadfile(const char *fn, size_t *num) { size_t n, j = 0; uint8_t *buf = 0; FILE *fi = fopen(fn, "rb"); if (fi) { fseek(fi, 0, SEEK_END); n = ftell(fi); fseek(fi, 0, SEEK_SET); if (n) { buf = (uint8_t*)malloc(n); if (buf) j = fread(buf, 1, n, fi); } fclose(fi); } if (num) *num = j; return buf; } #endif void jpeg_init_source(j_decompress_ptr cinfo) { (void)cinfo; } boolean jpeg_fill_input_buffer(j_decompress_ptr cinfo) { (void)cinfo; return FALSE; } void jpeg_skip_input_data(j_decompress_ptr cinfo, long num_bytes) { struct jpeg_source_mgr *src = cinfo->src; if ((size_t)num_bytes > src->bytes_in_buffer) num_bytes = src->bytes_in_buffer; src->next_input_byte += (size_t)num_bytes; src->bytes_in_buffer -= (size_t)num_bytes; } void jpeg_term_source(j_decompress_ptr cinfo) { (void)cinfo; } #endif #ifdef MEM_OUTPUT struct jpeg_dest_mem { struct jpeg_destination_mgr pub; /* public fields */ uint8_t *buffer; size_t size, bufsize, maxchunk; }; static void jpeg_init_destination(j_compress_ptr cinfo) { struct jpeg_dest_mem *p = (struct jpeg_dest_mem*)cinfo->dest; if (!p->buffer) { p->buffer = (uint8_t*)malloc(p->bufsize); } p->pub.next_output_byte = p->buffer; p->pub.free_in_buffer = p->bufsize; } static boolean jpeg_empty_output_buffer(j_compress_ptr cinfo) { struct jpeg_dest_mem *p = (struct jpeg_dest_mem*)cinfo->dest; size_t offset = p->bufsize, next; p->size = offset; next = p->bufsize; if (next > p->maxchunk) next = p->maxchunk; p->bufsize += next; p->buffer = (uint8_t*)realloc(p->buffer, p->bufsize); p->pub.next_output_byte = p->buffer + offset; p->pub.free_in_buffer = p->bufsize - offset; return TRUE; } static void jpeg_term_destination(j_compress_ptr cinfo) { struct jpeg_dest_mem *p = (struct jpeg_dest_mem*)cinfo->dest; p->size = p->bufsize - p->pub.free_in_buffer; } #endif #ifdef WASM static char** make_argv(char *str, int *argc_ret) { int i = 0, eol = 0, argc = 1; char **argv; for (;;) { char a = str[i++]; if (!a) break; if (eol) { if (a == eol) eol = 0; } else { if (a != ' ') { eol = ' '; if (a == '"' || a == '\'') eol = a; argc++; } } } *argc_ret = argc; argv = (char**)malloc(argc * sizeof(char*)); if (!argv) return argv; argv[0] = NULL; eol = 0; argc = 1; for (;;) { char a = *str++; if (!a) break; if (eol) { if (a == eol) { str[-1] = 0; eol = 0; } } else { if (a != ' ') { eol = ' '; if (a == '"' || a == '\'') { eol = a; str++; } argv[argc++] = str - 1; } } } return argv; } static int web_process(int64_t *params) { char *cmdline = (char*)params[0]; #else #ifdef _WIN32 typedef struct { HWND hwnd; } progress_data_t; static int progress(void *data, int cur, int max) { progress_data_t *prog = 
(progress_data_t*)data; int percent = 100 * cur / max; PostMessage(prog->hwnd, WM_USER, (WPARAM)percent, 0); return 0; } #endif #ifdef UNICODE // unicode hacks #define strcmp(a, b) wcscmp(a, S(b)) #define atoi(a) _wtoi(a) #define fopen(a, b) _wfopen(a, S(b)) #pragma GCC diagnostic ignored "-Wformat" int wmain(int argc, wchar_t **argv) { #else int main(int argc, char **argv) { #endif #endif struct jpeg_decompress_struct srcinfo; struct jpeg_compress_struct dstinfo; struct jpeg_error_mgr jsrcerr, jdsterr; jvirt_barray_ptr *coef_arrays; #ifdef MEM_INPUT size_t input_size = 0; uint8_t *input_mem = NULL; struct jpeg_source_mgr src_mgr = { 0 }; #else FILE *input_file = stdin; #endif #ifdef MEM_OUTPUT struct jpeg_dest_mem dest_mgr = { { 0 }, NULL, 0, 0x10000, 0x100000 }; #endif #ifndef WASM FILE *output_file = stdout; #endif int optimize = 0, jpeg_verbose = 0, cmd_info = 15, cmd_cpu = 0, cmd_copy = 2; int quality = 3, cmd_niter = -1, cmd_flags = -1; jpegqs_control_t opts = { 0 }; #ifdef _WIN32 progress_data_t prog; #endif #ifdef WASM int argc = 0; char **argv_ptr = make_argv(cmdline, &argc), **argv = argv_ptr; #else #ifndef APPNAME const TCHAR *progname = argv[0], *fn; #else const TCHAR *progname = S(TOSTRING(APPNAME)), *fn; #endif #endif #ifdef _WIN32 if (argc > 2 && !strcmp(argv[1], "--hwnd")) { prog.hwnd = (HWND)(intptr_t)atoi(argv[2]); opts.userdata = &prog; if (prog.hwnd) opts.progress = progress; argv += 2; argc -= 2; } #endif #ifdef WASM_ASYNC opts.progprec = 20; opts.progress = wasm_progress; #endif while (argc > 1) { const TCHAR *arg1 = argv[1], *arg2 = argc > 2 ? argv[2] : NULL, *arg = arg1; TCHAR c; if (arg[0] != '-' || !(c = arg[1])) break; if (c != '-') switch (c) { case 'o': arg = S("--optimize"); c = 0; break; case 'v': arg = S("--verbose"); break; case 'i': arg = S("--info"); break; case 'n': arg = S("--niter"); break; case 'q': arg = S("--quality"); break; case 't': arg = S("--threads"); break; case 'f': arg = S("--flags"); break; case 'p': arg = S("--cpu"); break; case 'c': arg = S("--copy"); break; default: c = '-'; } if (c != '-' && arg1[2]) { if (!c) break; arg2 = arg1 + 2; argc++; argv--; } #define CHECKNUM if ((unsigned)(arg2[0] - '0') > 9) break; switch (arg[2]) { case 'o': if (!strcmp(arg, "--optimize")) { optimize = 1; argv++; argc--; arg = NULL; } break; case 'v': if (argc > 2 && !strcmp(arg, "--verbose")) { CHECKNUM jpeg_verbose = atoi(arg2); argv += 2; argc -= 2; arg = NULL; } break; case 'i': if (argc > 2 && !strcmp(arg, "--info")) { CHECKNUM cmd_info = atoi(arg2); argv += 2; argc -= 2; arg = NULL; } break; case 'n': if (argc > 2 && !strcmp(arg, "--niter")) { CHECKNUM cmd_niter = atoi(arg2); argv += 2; argc -= 2; arg = NULL; } break; case 'q': if (argc > 2 && !strcmp(arg, "--quality")) { CHECKNUM quality = atoi(arg2); argv += 2; argc -= 2; arg = NULL; } break; case 't': if (argc > 2 && !strcmp(arg, "--threads")) { CHECKNUM opts.threads = atoi(arg2); argv += 2; argc -= 2; arg = NULL; } break; case 'f': if (argc > 2 && !strcmp(arg, "--flags")) { CHECKNUM cmd_flags = atoi(arg2); cmd_flags &= JPEGQS_FLAGS_MASK; argv += 2; argc -= 2; arg = NULL; } break; case 'c': if (argc > 2 && !strcmp(arg, "--cpu")) { CHECKNUM cmd_cpu = atoi(arg2); if (cmd_cpu > JPEGQS_CPU_MASK) cmd_cpu = JPEGQS_CPU_MASK; argv += 2; argc -= 2; arg = NULL; } else if (argc > 2 && !strcmp(arg, "--copy")) { CHECKNUM cmd_copy = atoi(arg2); argv += 2; argc -= 2; arg = NULL; } break; case 0: if (!strcmp(arg, "--")) { argv++; argc--; } break; } if (arg) break; } { int niter = 3, flags = 0; if (quality < 3) { 
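/* quality levels 0-2 are remapped here to 4-6 plus the LOW_QUALITY flag; the checks below then enable DIAGONALS / JOINT_YUV / UPSAMPLE_UV cumulatively for quality >= 4 / 5 / 6 */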
flags |= JPEGQS_LOW_QUALITY; quality += 4; } if (quality >= 4) flags |= JPEGQS_DIAGONALS; if (quality >= 5) flags |= JPEGQS_JOINT_YUV; if (quality >= 6) flags |= JPEGQS_UPSAMPLE_UV; opts.niter = cmd_niter >= 0 ? cmd_niter : niter; opts.flags = (cmd_flags >= 0 ? cmd_flags : flags) | JPEGQS_TRANSCODE; opts.flags |= cmd_cpu << JPEGQS_CPU_SHIFT; opts.flags |= cmd_info << JPEGQS_INFO_SHIFT; } #ifdef WASM free(argv_ptr); if (argc != 1) { logfmt("Unrecognized command line option.\n"); return 1; } #endif srcinfo.err = jpeg_std_error(&jsrcerr); if (jpeg_verbose) { #ifdef LIBJPEG_TURBO_VERSION logfmt("Compiled with libjpeg-turbo version %s\n", TOSTRING(LIBJPEG_TURBO_VERSION)); #else logfmt("Compiled with libjpeg version %d\n", JPEG_LIB_VERSION); #endif #if defined(JVERSION) && defined(JCOPYRIGHT) #ifdef JCOPYRIGHT_SHORT #define JPEG_COPYRIGHT JCOPYRIGHT_SHORT #else #define JPEG_COPYRIGHT JCOPYRIGHT #endif logfmt("Version string: " JVERSION "\n" JPEG_COPYRIGHT "\n\n"); #else // Search for libjpeg copyright (to work with static and dynamic linking) { int i, n = jsrcerr.last_jpeg_message; const char *msg, *ver = NULL; for (i = 0; i < n; i++) { msg = jsrcerr.jpeg_message_table[i]; if (msg && !memcmp(msg, "Copyright", 9)) break; } if (i < n) { if (i + 1 < n) { // version should be next to copyright ver = jsrcerr.jpeg_message_table[i + 1]; // check that it starts with a number if (ver && (ver[0] < '0' || ver[0] > '9')) ver = NULL; } if (!ver) ver = "not found"; logfmt("Version string: %s\n%s\n\n", ver, msg); } else { logfmt("Copyright not found\n\n"); } } #endif jpeg_verbose--; #ifndef WASM if (argc == 1) return 1; #endif } #ifndef WASM if (argc != 3) { logfmt( "JPEG Quant Smooth : " JPEGQS_COPYRIGHT " : " JPEGQS_VERSION "\n" "Build date: " __DATE__ "\n" "Uses libjpeg, run with \"--verbose 1\" to show its version and copyright\n" "\n" "Usage:\n" " " LS " [options] input.jpg output.jpg\n" "\n" "Options:\n" " -q, --quality n Quality setting (1-6, default is 3)\n" " -n, --niter n Number of iterations (default is 3)\n" " -t, --threads n Set the number of CPU threads to use\n" " -o, --optimize Option for libjpeg to produce smaller output file\n" " -v, --verbose n Print libjpeg debug output\n" " -i, --info n Print quantsmooth debug output (default is 15)\n" " Use the sum of flags: 0 - silent,\n" " 1/2/4 - various information,\n" " 8 - processing time, 16 - SIMD type.\n" " -p, --cpu n Use to lower the SIMD type if CPU detection fails:\n" " 0 - auto, 1 - scalar, 2 - SSE2, 3 - AVX2, 4 - AVX512.\n" " (x86 build selects between modes 1-3, x86_64 from 2-4)\n" "\n", progname); return 1; } #endif jpeg_create_decompress(&srcinfo); dstinfo.err = jpeg_std_error(&jdsterr); jpeg_create_compress(&dstinfo); jsrcerr.trace_level = jdsterr.trace_level = jpeg_verbose; srcinfo.mem->max_memory_to_use = dstinfo.mem->max_memory_to_use; #ifdef WASM input_mem = (uint8_t*)params[1]; input_size = params[2]; #else fn = argv[1]; #endif #ifdef MEM_INPUT #ifndef WASM input_mem = loadfile(fn, &input_size); if (!input_mem) { logfmt(LS ": can't open input file \"" LS "\"\n", progname, fn); return 1; } #endif #if 0 && (JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)) jpeg_mem_src(&srcinfo, input_mem, input_size); #else srcinfo.src = &src_mgr; src_mgr.init_source = jpeg_init_source; src_mgr.fill_input_buffer = jpeg_fill_input_buffer; src_mgr.skip_input_data = jpeg_skip_input_data; src_mgr.resync_to_restart = jpeg_resync_to_restart; /* use default method */ src_mgr.term_source = jpeg_term_source; src_mgr.next_input_byte = (const 
JOCTET*)input_mem; src_mgr.bytes_in_buffer = input_size; #endif #else if (strcmp(fn, "-")) { if ((input_file = fopen(fn, "rb")) == NULL) { logfmt(LS ": can't open input file \"" LS "\"\n", progname, fn); return 1; } } else { #ifdef USE_SETMODE setmode(fileno(stdin), O_BINARY); #endif } jpeg_stdio_src(&srcinfo, input_file); #endif // jcopy_markers_setup if (cmd_copy > 0) jpeg_save_markers(&srcinfo, JPEG_COM, 0xFFFF); if (cmd_copy > 1) { int i; for (i = 0; i < 16; i++) jpeg_save_markers(&srcinfo, JPEG_APP0 + i, 0xFFFF); } (void) jpeg_read_header(&srcinfo, TRUE); coef_arrays = jpeg_read_coefficients(&srcinfo); do_quantsmooth(&srcinfo, coef_arrays, &opts); jpeg_copy_critical_parameters(&srcinfo, &dstinfo); if (optimize) dstinfo.optimize_coding = TRUE; #ifdef MEM_OUTPUT // uint8_t *outbuffer; unsigned long outsize; // jpeg_mem_dest(dstinfo, &outbuffer, &outsize); dest_mgr.pub.init_destination = jpeg_init_destination; dest_mgr.pub.empty_output_buffer = jpeg_empty_output_buffer; dest_mgr.pub.term_destination = jpeg_term_destination; dstinfo.dest = (struct jpeg_destination_mgr*)&dest_mgr; #else // If output opened after reading coefs, then we can write result to input file fn = argv[2]; if (strcmp(fn, "-")) { if ((output_file = fopen(fn, "wb")) == NULL) { logfmt(LS ": can't open output file \"" LS "\"\n", progname, fn); return 1; } } else { #ifdef USE_SETMODE setmode(fileno(stdout), O_BINARY); #endif } jpeg_stdio_dest(&dstinfo, output_file); #endif /* Start compressor (note no image data is actually written here) */ jpeg_write_coefficients(&dstinfo, coef_arrays); // jcopy_markers_execute { jpeg_saved_marker_ptr marker; for (marker = srcinfo.marker_list; marker; marker = marker->next) { if (dstinfo.write_JFIF_header && marker->marker == JPEG_APP0 && marker->data_length >= 5 && !memcmp(marker->data, "JFIF", 5)) continue; if (dstinfo.write_Adobe_marker && marker->marker == JPEG_APP0 + 14 && marker->data_length >= 5 && !memcmp(marker->data, "Adobe", 5)) continue; jpeg_write_marker(&dstinfo, marker->marker, marker->data, marker->data_length); } } /* Finish compression and release memory */ jpeg_finish_compress(&dstinfo); jpeg_destroy_compress(&dstinfo); (void) jpeg_finish_decompress(&srcinfo); jpeg_destroy_decompress(&srcinfo); /* Close files, if we opened them */ #ifndef MEM_INPUT if (input_file != stdin) fclose(input_file); #endif #ifdef WASM params[3] = (intptr_t)dest_mgr.buffer; params[4] = dest_mgr.size; #else #ifdef MEM_OUTPUT fn = argv[2]; if (strcmp(fn, "-")) { if ((output_file = fopen(fn, "wb")) == NULL) { logfmt(LS ": can't open output file \"" LS "\"\n", progname, fn); return 1; } } else { #ifdef USE_SETMODE setmode(fileno(stdout), O_BINARY); #endif } if (dest_mgr.buffer) { fwrite(dest_mgr.buffer, 1, dest_mgr.size, output_file); free(dest_mgr.buffer); } #endif if (output_file != stdout) fclose(output_file); #endif return jsrcerr.num_warnings + jdsterr.num_warnings ? 
2 : 0; } // for testing purposes #ifdef WASM_MAIN int main(int argc, char **argv) { int64_t params[5]; int i, n, ret; char *cmdline = NULL; size_t cmd_size = 1; const char *progname = "quantsmooth", *fn; uint8_t *input_mem; size_t input_size; if (argc < 3) { logfmt("Unrecognized command line.\n"); return 1; } for (n = 1; n < argc - 2; n++) cmd_size += strlen(argv[n]) + 3; cmdline = malloc(cmd_size); if (!cmdline) return 1; cmd_size = 0; for (i = 1; i < argc - 2; i++) { const char *str = argv[i]; int len = strlen(str); cmdline[cmd_size++] = '"'; memcpy(cmdline + cmd_size, str, len); cmd_size += len; cmdline[cmd_size++] = '"'; cmdline[cmd_size++] = ' '; } cmdline[cmd_size] = 0; params[0] = (intptr_t)cmdline; // printf("cmdline: %s\n", cmdline); argv += argc - 3; fn = argv[1]; input_mem = loadfile(fn, &input_size); if (!input_mem) { logfmt("%s: can't open input file \"%s\"\n", progname, fn); return 1; } params[1] = (intptr_t)input_mem; params[2] = input_size; params[3] = 0; params[4] = 0; ret = web_process(params); free(input_mem); if (params[3]) { FILE *output_file; fn = argv[2]; if ((output_file = fopen(fn, "wb")) == NULL) { logfmt("%s: can't open output file \"%s\"\n", progname, fn); return 1; } fwrite((void*)params[3], 1, (size_t)params[4], output_file); fclose(output_file); free((void*)params[3]); } if (cmdline) free(cmdline); return ret; } #elif defined(WASM) EMSCRIPTEN_KEEPALIVE int web_main(int64_t *params) { int ret = web_process(params); EM_ASM( setTimeout(Module["wasm_return"], 0); ); return ret; } #endif jpeg-quantsmooth-1.20210408/quantsmooth.h000066400000000000000000002124701403361770700201710ustar00rootroot00000000000000/* * Copyright (C) 2016-2021 Ilya Kurdyukov * * This file is part of jpeg quantsmooth * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . */ #ifdef NO_MATHLIB #define roundf(x) (float)(int)((x) < 0 ? (x) - 0.5f : (x) + 0.5f) #define fabsf(x) (float)((x) < 0 ? 
-(x) : (x)) #else #include #endif #ifdef _OPENMP #include #else #define omp_get_thread_num() 0 #endif #if !defined(TRANSCODE_ONLY) && !defined(JPEG_INTERNALS) // declarations needed from jpegint.h #define DSTATE_SCANNING 205 #define DSTATE_RAW_OK 206 EXTERN(void) jinit_d_main_controller(j_decompress_ptr, boolean); EXTERN(void) jinit_inverse_dct(j_decompress_ptr); EXTERN(void) jinit_upsampler(j_decompress_ptr); EXTERN(void) jinit_color_deconverter(j_decompress_ptr); struct jpeg_decomp_master { void (*prepare_for_output_pass) (j_decompress_ptr); void (*finish_output_pass) (j_decompress_ptr); boolean is_dummy_pass; #ifdef LIBJPEG_TURBO_VERSION JDIMENSION first_iMCU_col, last_iMCU_col; JDIMENSION first_MCU_col[MAX_COMPONENTS]; JDIMENSION last_MCU_col[MAX_COMPONENTS]; boolean jinit_upsampler_no_alloc; #endif }; #endif #ifdef WITH_LOG #ifdef _WIN32 #define WIN32_LEAN_AND_MEAN #ifdef INT32 #undef INT32 #endif // conflict with libjpeg typedef #define INT32 INT32_WIN #include static int64_t get_time_usec() { LARGE_INTEGER freq, perf; QueryPerformanceFrequency(&freq); QueryPerformanceCounter(&perf); return perf.QuadPart * 1000000.0 / freq.QuadPart; } #else #include #include static int64_t get_time_usec() { struct timeval time; gettimeofday(&time, NULL); return time.tv_sec * (int64_t)1000000 + time.tv_usec; } #endif #endif #ifndef NO_SIMD #if defined(__SSE2__) #define USE_SSE2 #include #if defined(__SSSE3__) #include #else static inline __m128i SSE2_mm_abs_epi16(__m128i a) { __m128i t = _mm_srai_epi16(a, 15); return _mm_xor_si128(_mm_add_epi16(a, t), t); } #define _mm_abs_epi16 SSE2_mm_abs_epi16 #endif #if defined(__SSE4_1__) #define USE_SSE4 #include #else #define _mm_cvtepu8_epi16(a) _mm_unpacklo_epi8(a, _mm_setzero_si128()) // _mm_cmplt_epi16(a, _mm_setzero_si128()) or _mm_srai_epi16(a, 15) #define _mm_cvtepi16_epi32(a) _mm_unpacklo_epi16(a, _mm_srai_epi16(a, 15)) static inline __m128i SSE2_mm_mullo_epi32(__m128i a, __m128i b) { __m128i l = _mm_mul_epu32(a, b); __m128i h = _mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4)); return _mm_unpacklo_epi64(_mm_unpacklo_epi32(l, h), _mm_unpackhi_epi32(l, h)); } #define _mm_mullo_epi32 SSE2_mm_mullo_epi32 #endif #endif // __SSE2__ #ifdef __AVX2__ #define USE_AVX2 #include #endif #ifdef __FMA__ #include #else #define _mm256_fmadd_ps(a, b, c) _mm256_add_ps(_mm256_mul_ps(a, b), c) #define _mm256_fmsub_ps(a, b, c) _mm256_sub_ps(_mm256_mul_ps(a, b), c) #define _mm256_fnmadd_ps(a, b, c) _mm256_sub_ps(c, _mm256_mul_ps(a, b)) #endif #ifdef __AVX512F__ #include #define USE_AVX512 #endif #if defined(__ARM_NEON__) || defined(__aarch64__) #define USE_NEON #include // for testing on x86 #elif defined(TEST_NEON) && defined(__SSSE3__) #define USE_NEON #define DO_PRAGMA(x) _Pragma(#x) #define X(x) DO_PRAGMA(GCC diagnostic ignored #x) X(-Wunused-function) X(-Wdeprecated-declarations) #pragma GCC diagnostic push X(-Wsign-compare) X(-Woverflow) X(-Wunused-parameter) X(-Wsequence-point) X(-Wstrict-aliasing) #undef X #include "NEON_2_SSE.h" #pragma GCC diagnostic pop #warning NEON test build on x86 #elif defined(__arm__) #warning compiling for ARM without NEON support #endif #ifdef USE_NEON #if 1 && defined(__SSE2__) #define vdivq_f32 _mm_div_ps #elif !defined(__aarch64__) static inline float32x4_t NEON_vdivq_f32(float32x4_t a, float32x4_t b) { float32x4_t t = vrecpeq_f32(b); t = vmulq_f32(t, vrecpsq_f32(b, t)); t = vmulq_f32(t, vrecpsq_f32(b, t)); return vmulq_f32(a, t); } #define vdivq_f32 NEON_vdivq_f32 #endif #endif #endif // NO_SIMD #define ALIGN(n) 
__attribute__((aligned(n))) #include "idct.h" #ifndef JPEGQS_ATTR #define JPEGQS_ATTR static #endif #include "libjpegqs.h" static float** quantsmooth_init(int flags) { int i, n = DCTSIZE, nn = n * n, n2 = nn + n * 4; #ifdef NO_SIMD intptr_t nalign = 1; #else intptr_t nalign = 64; #endif float bcoef = flags & JPEGQS_DIAGONALS ? 4.0 : 2.0; int size = flags & JPEGQS_DIAGONALS ? nn * 4 + n * (4 - 2) : nn * 2 + n * 4; float *ptr, **tables = (float**)malloc(nn * sizeof(float*) + nn * size * sizeof(float) + nalign - 1); if (!tables) return NULL; ptr = (float*)(((intptr_t)&tables[DCTSIZE2] + nalign - 1) & -nalign); for (i = nn - 1; i >= 0; i--, ptr += size) tables[(int)jpegqs_natural_order[i]] = ptr; for (i = 0; i < DCTSIZE2; i++) { float *tab = tables[i], temp[DCTSIZE2]; int x, y, p; memset(temp, 0, sizeof(temp)); temp[i] = 1; idct_float(temp, temp); #define M1(a, b, j) \ for (y = 0; y < n - 1 + a; y++) \ for (x = 0; x < n - 1 + b; x++) { p = y * n + x; \ tab[p + j] = temp[p] - temp[(y + b) * n + x + a]; } M1(1, 0, 0) M1(0, 1, n2) #undef M1 for (y = n - 1, x = 0; x < n; x++) { tab[x * n + y] = tab[n2 + y * n + x] = 0; #define M1(a, b, j) tab[nn + n * j + x] = temp[a + b * n] * bcoef; M1(x, 0, 0) M1(x, y, 1) M1(0, x, 2) M1(y, x, 3) #undef M1 } if (flags & JPEGQS_DIAGONALS) { tab += nn * 2 + n * 4; for (y = 0; y < n - 1; y++, tab += n * 2) { for (x = 0; x < n - 1; x++) { p = y * n + x; tab[x] = temp[p] - temp[p + n + 1]; tab[x + n] = temp[p + 1] - temp[p + n]; } tab[x] = tab[x + n] = 0; } } } return tables; } #if defined(USE_JSIMD) && defined(LIBJPEG_TURBO_VERSION) #define JSIMD_CONCAT(x) jsimd_idct_islow_##x #define JSIMD_NAME(x) JSIMD_CONCAT(x) EXTERN(void) JSIMD_NAME(USE_JSIMD)(void*, JCOEFPTR, JSAMPARRAY, JDIMENSION); #define idct_islow(coef, buf, st) JSIMD_NAME(USE_JSIMD)(dct_table1, coef, output_buf, output_col) #define X 1,1,1,1, 1,1,1,1 static int16_t dct_table1[DCTSIZE2] = { X,X,X,X, X,X,X,X }; #undef X #endif static const char zigzag_refresh[DCTSIZE2] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1 }; static void fdct_clamp(float *buf, JCOEFPTR coef, UINT16 *quantval) { int x, y, n = DCTSIZE; (void)x; (void)y; fdct_float(buf, buf); #if 1 && defined(USE_NEON) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) for (y = 0; y < n; y++) { int16x8_t v0, v1, v2, v3; float32x4_t f0, f1, f2, f3, f4, f5; int32x4_t v4; v1 = vld1q_s16((int16_t*)&quantval[y * n]); v0 = vld1q_s16((int16_t*)&coef[y * n]); v3 = vshrq_n_s16(v0, 15); v2 = veorq_s16(vaddq_s16(vshrq_n_s16(v1, 1), v3), v3); v0 = vaddq_s16(v0, v2); f3 = vdupq_n_f32(0.5f); f5 = vnegq_f32(f3); #define M1(low, f0, f1, x) \ f4 = vld1q_f32(&buf[y * n + x]); \ v4 = vmovl_s16(vget_##low##_s16(v0)); \ f0 = vaddq_f32(f4, vbslq_f32(vcltq_f32(f4, vdupq_n_f32(0)), f5, f3)); \ /* correction for imprecise divide */ \ f1 = vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(v4, 31)), f5, f3); \ f4 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v1))); \ f1 = vdivq_f32(vaddq_f32(vcvtq_f32_s32(v4), f1), f4); M1(low, f0, f1, 0) M1(high, f2, f3, 4) #undef M1 v2 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(f0)), vmovn_s32(vcvtq_s32_f32(f2))); v0 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(f1)), vmovn_s32(vcvtq_s32_f32(f3))); v0 = vmulq_s16(v0, v1); /* v0 = a0, v1 = div, v2 = add */ v3 = vaddq_s16(v1, vreinterpretq_s16_u16(vcgeq_s16(v0, vdupq_n_s16(0)))); v2 = vminq_s16(v2, vaddq_s16(v0, vshrq_n_s16(v3, 1))); v3 = 
vaddq_s16(v1, vreinterpretq_s16_u16(vcleq_s16(v0, vdupq_n_s16(0)))); v2 = vmaxq_s16(v2, vsubq_s16(v0, vshrq_n_s16(v3, 1))); vst1q_s16((int16_t*)&coef[y * n], v2); } else #elif 1 && defined(USE_AVX512) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) for (y = 0; y < n; y += 2) { __m256i v0, v1, v2, v3; __m512 f0, f1; v1 = _mm256_loadu_si256((__m256i*)&quantval[y * n]); v0 = _mm256_loadu_si256((__m256i*)&coef[y * n]); v2 = _mm256_srli_epi16(v1, 1); v3 = _mm256_srai_epi16(v0, 15); v2 = _mm256_xor_si256(_mm256_add_epi16(v2, v3), v3); v0 = _mm256_add_epi16(v0, v2); f0 = _mm512_loadu_ps(&buf[y * n]); /* vpmovd2m, and+or (need AVX512DQ) */ f1 = _mm512_mask_blend_ps(_mm512_cmp_ps_mask(f0, _mm512_setzero_ps(), 1), _mm512_set1_ps(0.5f), _mm512_set1_ps(-0.5f)); f0 = _mm512_add_ps(f0, f1); f1 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v0)); f1 = _mm512_div_ps(f1, _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v1))); v2 = _mm512_cvtepi32_epi16(_mm512_cvttps_epi32(f0)); v0 = _mm512_cvtepi32_epi16(_mm512_cvttps_epi32(f1)); v0 = _mm256_mullo_epi16(v0, v1); /* v0 = a0, v1 = div, v2 = add */ v1 = _mm256_add_epi16(v1, _mm256_set1_epi16(-1)); v3 = _mm256_sub_epi16(v1, _mm256_srai_epi16(v0, 15)); v2 = _mm256_min_epi16(v2, _mm256_add_epi16(v0, _mm256_srai_epi16(v3, 1))); v3 = _mm256_sub_epi16(v1, _mm256_cmpgt_epi16(v0, _mm256_setzero_si256())); v2 = _mm256_max_epi16(v2, _mm256_sub_epi16(v0, _mm256_srai_epi16(v3, 1))); _mm256_storeu_si256((__m256i*)&coef[y * n], v2); } else #elif 1 && defined(USE_AVX2) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) for (y = 0; y < n; y++) { __m128i v0, v1, v2, v3; __m256i v4, v5; __m256 f0, f1; v1 = _mm_loadu_si128((__m128i*)&quantval[y * n]); v0 = _mm_loadu_si128((__m128i*)&coef[y * n]); v2 = _mm_srli_epi16(v1, 1); v3 = _mm_srai_epi16(v0, 15); v2 = _mm_xor_si128(_mm_add_epi16(v2, v3), v3); v0 = _mm_add_epi16(v0, v2); f0 = _mm256_loadu_ps(&buf[y * n]); f1 = _mm256_blendv_ps(_mm256_set1_ps(0.5f), _mm256_set1_ps(-0.5f), f0); f0 = _mm256_add_ps(f0, f1); f1 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v0)); f1 = _mm256_div_ps(f1, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v1))); v4 = _mm256_cvttps_epi32(f0); v5 = _mm256_cvttps_epi32(f1); v4 = _mm256_permute4x64_epi64(_mm256_packs_epi32(v5, v4), 0xd8); v0 = _mm256_castsi256_si128(v4); v2 = _mm256_extracti128_si256(v4, 1); v0 = _mm_mullo_epi16(v0, v1); /* v0 = a0, v1 = div, v2 = add */ v1 = _mm_add_epi16(v1, _mm_set1_epi16(-1)); v3 = _mm_sub_epi16(v1, _mm_srai_epi16(v0, 15)); v2 = _mm_min_epi16(v2, _mm_add_epi16(v0, _mm_srai_epi16(v3, 1))); v3 = _mm_sub_epi16(v1, _mm_cmpgt_epi16(v0, _mm_setzero_si128())); v2 = _mm_max_epi16(v2, _mm_sub_epi16(v0, _mm_srai_epi16(v3, 1))); _mm_storeu_si128((__m128i*)&coef[y * n], v2); } else #elif 1 && defined(USE_SSE2) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) for (y = 0; y < n; y++) { __m128i v0, v1, v2, v3; __m128 f0, f1, f2, f3, f4; v1 = _mm_loadu_si128((__m128i*)&quantval[y * n]); v0 = _mm_loadu_si128((__m128i*)&coef[y * n]); v2 = _mm_srli_epi16(v1, 1); v3 = _mm_srai_epi16(v0, 15); v2 = _mm_xor_si128(_mm_add_epi16(v2, v3), v3); v0 = _mm_add_epi16(v0, v2); v2 = _mm_setzero_si128(); v3 = _mm_srai_epi16(v0, 15); #define M1(lo, f0, f1, x) \ f0 = _mm_loadu_ps((float*)&buf[y * n + x]); \ f4 = _mm_cmplt_ps(f0, _mm_setzero_ps()); \ f4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(f4), 31)); \ f0 = _mm_add_ps(f0, _mm_or_ps(f4, _mm_set1_ps(0.5f))); \ f4 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v1, v2)); \ f1 = 
_mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v0, v3)); \ f1 = _mm_div_ps(f1, f4); M1(lo, f0, f1, 0) M1(hi, f2, f3, 4) #undef M1 v2 = _mm_packs_epi32(_mm_cvttps_epi32(f0), _mm_cvttps_epi32(f2)); v0 = _mm_packs_epi32(_mm_cvttps_epi32(f1), _mm_cvttps_epi32(f3)); v0 = _mm_mullo_epi16(v0, v1); /* v0 = a0, v1 = div, v2 = add */ v1 = _mm_add_epi16(v1, _mm_set1_epi16(-1)); v3 = _mm_sub_epi16(v1, _mm_srai_epi16(v0, 15)); v2 = _mm_min_epi16(v2, _mm_add_epi16(v0, _mm_srai_epi16(v3, 1))); v3 = _mm_sub_epi16(v1, _mm_cmpgt_epi16(v0, _mm_setzero_si128())); v2 = _mm_max_epi16(v2, _mm_sub_epi16(v0, _mm_srai_epi16(v3, 1))); _mm_storeu_si128((__m128i*)&coef[y * n], v2); } else #endif for (x = 0; x < n * n; x++) { int div = quantval[x], coef1 = coef[x], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? d1 : d0); add = roundf(buf[x]); if (add > dh) add = dh; if (add < dl) add = dl; coef[x] = add; } } static void quantsmooth_block(JCOEFPTR coef, UINT16 *quantval, JSAMPLE *image, JSAMPLE *image2, int stride, int flags, float **tables, int luma) { int k, n = DCTSIZE, x, y, need_refresh = 1; JSAMPLE ALIGN(32) buf[DCTSIZE2 + DCTSIZE * 6], *border = buf + n * n; #ifndef NO_SIMD int16_t ALIGN(32) temp[DCTSIZE2 * 4 + DCTSIZE * (4 - 2)]; #endif #ifdef USE_JSIMD JSAMPROW output_buf[DCTSIZE]; int output_col = 0; for (k = 0; k < n; k++) output_buf[k] = buf + k * n; #endif (void)x; if (image2) { float ALIGN(32) fbuf[DCTSIZE2]; #if 1 && defined(USE_NEON) for (y = 0; y < n; y++) { uint8x8_t h0, h1; uint16x8_t sumA, sumB, v0, v1; uint16x4_t h2, h3; float32x4_t v5, scale; uint32x4_t v4, sumAA1, sumAB1, sumAA2, sumAB2; #define M1(xx, yy) \ h0 = vld1_u8(&image2[(y + yy) * stride + xx]); \ h1 = vld1_u8(&image[(y + yy) * stride + xx]); \ sumA = vaddw_u8(sumA, h0); v0 = vmull_u8(h0, h0); \ sumB = vaddw_u8(sumB, h1); v1 = vmull_u8(h0, h1); \ sumAA1 = vaddw_u16(sumAA1, vget_low_u16(v0)); \ sumAB1 = vaddw_u16(sumAB1, vget_low_u16(v1)); \ sumAA2 = vaddw_u16(sumAA2, vget_high_u16(v0)); \ sumAB2 = vaddw_u16(sumAB2, vget_high_u16(v1)); #define M2 \ sumA = vaddq_u16(sumA, sumA); sumB = vaddq_u16(sumB, sumB); \ sumAA1 = vaddq_u32(sumAA1, sumAA1); sumAA2 = vaddq_u32(sumAA2, sumAA2); \ sumAB1 = vaddq_u32(sumAB1, sumAB1); sumAB2 = vaddq_u32(sumAB2, sumAB2); h0 = vld1_u8(&image2[y * stride]); h1 = vld1_u8(&image[y * stride]); sumA = vmovl_u8(h0); v0 = vmull_u8(h0, h0); sumB = vmovl_u8(h1); v1 = vmull_u8(h0, h1); sumAA1 = vmovl_u16(vget_low_u16(v0)); sumAB1 = vmovl_u16(vget_low_u16(v1)); sumAA2 = vmovl_u16(vget_high_u16(v0)); sumAB2 = vmovl_u16(vget_high_u16(v1)); M2 M1(0, -1) M1(-1, 0) M1(1, 0) M1(0, 1) M2 M1(-1, -1) M1(1, -1) M1(-1, 1) M1(1, 1) #undef M2 #undef M1 v0 = vmovl_u8(vld1_u8(&image2[y * stride])); #define M1(low, sumAA, sumAB, x) \ h2 = vget_##low##_u16(sumA); sumAA = vshlq_n_u32(sumAA, 4); \ h3 = vget_##low##_u16(sumB); sumAB = vshlq_n_u32(sumAB, 4); \ sumAA = vmlsl_u16(sumAA, h2, h2); sumAB = vmlsl_u16(sumAB, h2, h3); \ v4 = vtstq_u32(sumAA, sumAA); \ sumAB = vandq_u32(sumAB, v4); sumAA = vornq_u32(sumAA, v4); \ scale = vdivq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(sumAB)), \ vcvtq_f32_s32(vreinterpretq_s32_u32(sumAA))); \ scale = vmaxq_f32(scale, vdupq_n_f32(-16.0f)); \ scale = vminq_f32(scale, vdupq_n_f32(16.0f)); \ v4 = vshll_n_u16(vget_##low##_u16(v0), 4); \ v5 = vcvtq_n_f32_s32(vreinterpretq_s32_u32(vsubw_u16(v4, h2)), 4); \ v5 = vmlaq_f32(vcvtq_n_f32_u32(vmovl_u16(h3), 4), v5, scale); \ v5 = vmaxq_f32(v5, vdupq_n_f32(0)); \ 
v5 = vsubq_f32(v5, vdupq_n_f32(CENTERJSAMPLE)); \ v5 = vminq_f32(v5, vdupq_n_f32(CENTERJSAMPLE)); \ vst1q_f32(fbuf + y * n + x, v5); M1(low, sumAA1, sumAB1, 0) M1(high, sumAA2, sumAB2, 4) #undef M1 } #elif 1 && defined(USE_AVX2) for (y = 0; y < n; y++) { __m128i v0, v1; __m256i v2, v3, v4, sumA, sumB, sumAA, sumAB; __m256 v5, scale; #define M1(x0, y0, x1, y1) \ v0 = _mm_loadl_epi64((__m128i*)&image2[(y + y0) * stride + x0]); \ v1 = _mm_loadl_epi64((__m128i*)&image2[(y + y1) * stride + x1]); \ v2 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v0, v1)); \ v0 = _mm_loadl_epi64((__m128i*)&image[(y + y0) * stride + x0]); \ v1 = _mm_loadl_epi64((__m128i*)&image[(y + y1) * stride + x1]); \ v3 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v0, v1)); \ sumA = _mm256_add_epi16(sumA, v2); \ sumB = _mm256_add_epi16(sumB, v3); \ sumAA = _mm256_add_epi32(sumAA, _mm256_madd_epi16(v2, v2)); \ sumAB = _mm256_add_epi32(sumAB, _mm256_madd_epi16(v2, v3)); v0 = _mm_loadl_epi64((__m128i*)&image2[y * stride]); v1 = _mm_loadl_epi64((__m128i*)&image[y * stride]); sumA = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v0, v0)); sumB = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v1, v1)); sumAA = _mm256_madd_epi16(sumA, sumA); sumAB = _mm256_madd_epi16(sumA, sumB); M1(0, -1, -1, 0) M1(1, 0, 0, 1) sumA = _mm256_add_epi16(sumA, sumA); sumAA = _mm256_add_epi32(sumAA, sumAA); sumB = _mm256_add_epi16(sumB, sumB); sumAB = _mm256_add_epi32(sumAB, sumAB); M1(-1, -1, 1, -1) M1(-1, 1, 1, 1) #undef M1 v3 = _mm256_set1_epi16(1); v2 = _mm256_madd_epi16(sumA, v3); sumAA = _mm256_slli_epi32(sumAA, 4); v3 = _mm256_madd_epi16(sumB, v3); sumAB = _mm256_slli_epi32(sumAB, 4); sumAA = _mm256_sub_epi32(sumAA, _mm256_mullo_epi32(v2, v2)); sumAB = _mm256_sub_epi32(sumAB, _mm256_mullo_epi32(v2, v3)); v4 = _mm256_cmpeq_epi32(sumAA, _mm256_setzero_si256()); sumAB = _mm256_andnot_si256(v4, sumAB); scale = _mm256_cvtepi32_ps(_mm256_or_si256(sumAA, v4)); scale = _mm256_div_ps(_mm256_cvtepi32_ps(sumAB), scale); scale = _mm256_max_ps(scale, _mm256_set1_ps(-16.0f)); scale = _mm256_min_ps(scale, _mm256_set1_ps(16.0f)); v0 = _mm_loadl_epi64((__m128i*)&image2[y * stride]); v4 = _mm256_slli_epi32(_mm256_cvtepu8_epi32(v0), 4); v5 = _mm256_cvtepi32_ps(_mm256_sub_epi32(v4, v2)); // v5 = _mm256_add_ps(_mm256_mul_ps(v5, scale), _mm256_cvtepi32_ps(v3)); v5 = _mm256_fmadd_ps(v5, scale, _mm256_cvtepi32_ps(v3)); v5 = _mm256_mul_ps(v5, _mm256_set1_ps(1.0f / 16)); v5 = _mm256_max_ps(v5, _mm256_setzero_ps()); v5 = _mm256_sub_ps(v5, _mm256_set1_ps(CENTERJSAMPLE)); v5 = _mm256_min_ps(v5, _mm256_set1_ps(CENTERJSAMPLE)); _mm256_storeu_ps(fbuf + y * n, v5); } #elif 1 && defined(USE_SSE2) for (y = 0; y < n; y++) { __m128i v0, v1, v2, v3, v4, sumA, sumB, sumAA1, sumAB1, sumAA2, sumAB2; __m128 v5, scale; #define M1(x0, y0, x1, y1) \ v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[(y + y0) * stride + x0])); \ v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[(y + y1) * stride + x1])); \ v2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[(y + y0) * stride + x0])); \ v3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[(y + y1) * stride + x1])); \ sumA = _mm_add_epi16(_mm_add_epi16(sumA, v0), v1); \ sumB = _mm_add_epi16(_mm_add_epi16(sumB, v2), v3); \ v4 = _mm_unpacklo_epi16(v0, v1); sumAA1 = _mm_add_epi32(sumAA1, _mm_madd_epi16(v4, v4)); \ v1 = _mm_unpackhi_epi16(v0, v1); sumAA2 = _mm_add_epi32(sumAA2, _mm_madd_epi16(v1, v1)); \ sumAB1 = _mm_add_epi32(sumAB1, _mm_madd_epi16(v4, _mm_unpacklo_epi16(v2, v3))); \ sumAB2 = _mm_add_epi32(sumAB2, _mm_madd_epi16(v1, 
_mm_unpackhi_epi16(v2, v3))); v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[y * stride])); v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[y * stride])); v2 = _mm_unpacklo_epi16(v0, v0); sumAA1 = _mm_madd_epi16(v2, v2); v3 = _mm_unpacklo_epi16(v1, v1); sumAB1 = _mm_madd_epi16(v2, v3); v2 = _mm_unpackhi_epi16(v0, v0); sumAA2 = _mm_madd_epi16(v2, v2); v3 = _mm_unpackhi_epi16(v1, v1); sumAB2 = _mm_madd_epi16(v2, v3); sumA = _mm_add_epi16(v0, v0); sumB = _mm_add_epi16(v1, v1); M1(0, -1, -1, 0) M1(1, 0, 0, 1) sumA = _mm_add_epi16(sumA, sumA); sumB = _mm_add_epi16(sumB, sumB); sumAA1 = _mm_add_epi32(sumAA1, sumAA1); sumAA2 = _mm_add_epi32(sumAA2, sumAA2); sumAB1 = _mm_add_epi32(sumAB1, sumAB1); sumAB2 = _mm_add_epi32(sumAB2, sumAB2); M1(-1, -1, 1, -1) M1(-1, 1, 1, 1) #undef M1 v0 = _mm_setzero_si128(); v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[y * stride])); #define M1(lo, sumAA, sumAB, x) \ v2 = _mm_unpack##lo##_epi16(sumA, v0); sumAA = _mm_slli_epi32(sumAA, 4); \ v3 = _mm_unpack##lo##_epi16(sumB, v0); sumAB = _mm_slli_epi32(sumAB, 4); \ sumAA = _mm_sub_epi32(sumAA, _mm_mullo_epi32(v2, v2)); \ sumAB = _mm_sub_epi32(sumAB, _mm_mullo_epi32(v2, v3)); \ v4 = _mm_cmpeq_epi32(sumAA, v0); sumAB = _mm_andnot_si128(v4, sumAB); \ scale = _mm_cvtepi32_ps(_mm_or_si128(sumAA, v4)); \ scale = _mm_div_ps(_mm_cvtepi32_ps(sumAB), scale); \ scale = _mm_max_ps(scale, _mm_set1_ps(-16.0f)); \ scale = _mm_min_ps(scale, _mm_set1_ps(16.0f)); \ v4 = _mm_slli_epi32(_mm_unpack##lo##_epi16(v1, v0), 4); \ v5 = _mm_cvtepi32_ps(_mm_sub_epi32(v4, v2)); \ v5 = _mm_add_ps(_mm_mul_ps(v5, scale), _mm_cvtepi32_ps(v3)); \ v5 = _mm_mul_ps(v5, _mm_set1_ps(1.0f / 16)); \ v5 = _mm_max_ps(v5, _mm_setzero_ps()); \ v5 = _mm_sub_ps(v5, _mm_set1_ps(CENTERJSAMPLE)); \ v5 = _mm_min_ps(v5, _mm_set1_ps(CENTERJSAMPLE)); \ _mm_storeu_ps(fbuf + y * n + x, v5); M1(lo, sumAA1, sumAB1, 0) M1(hi, sumAA2, sumAB2, 4) #undef M1 } #else for (y = 0; y < n; y++) for (x = 0; x < n; x++) { float sumA = 0, sumB = 0, sumAA = 0, sumAB = 0; float divN = 1.0f / 16, scale, offset; float a; #define M1(xx, yy) { \ float a = image2[(y + yy) * stride + x + xx]; \ float b = image[(y + yy) * stride + x + xx]; \ sumA += a; sumAA += a * a; \ sumB += b; sumAB += a * b; } #define M2 sumA += sumA; sumB += sumB; \ sumAA += sumAA; sumAB += sumAB; M1(0, 0) M2 M1(0, -1) M1(-1, 0) M1(1, 0) M1(0, 1) M2 M1(-1, -1) M1(1, -1) M1(-1, 1) M1(1, 1) #undef M2 #undef M1 scale = sumAA - sumA * divN * sumA; if (scale != 0.0f) scale = (sumAB - sumA * divN * sumB) / scale; scale = scale < -16.0f ? -16.0f : scale; scale = scale > 16.0f ? 16.0f : scale; offset = (sumB - scale * sumA) * divN; a = image2[y * stride + x] * scale + offset; a = a < 0 ? 0 : a > MAXJSAMPLE + 1 ? MAXJSAMPLE + 1 : a; fbuf[y * n + x] = a - CENTERJSAMPLE; } #endif fdct_clamp(fbuf, coef, quantval); } if (flags & JPEGQS_LOW_QUALITY) { float ALIGN(32) fbuf[DCTSIZE2]; float range = 0, c0 = 2, c1 = c0 * sqrtf(0.5f); if (image2) goto end; { int sum = 0; for (x = 1; x < n * n; x++) { int a = coef[x]; a = a < 0 ? 
-a : a; range += quantval[x] * a; sum += a; } if (sum) range *= 4.0f / sum; if (range > CENTERJSAMPLE) range = CENTERJSAMPLE; range = roundf(range); } #if 1 && defined(USE_NEON) for (y = 0; y < n; y++) { int16x8_t v4, v5; uint16x8_t v6 = vdupq_n_u16((int)range); float32x2_t f4; uint8x8_t i0, i1; float32x4_t f0, f1, s0 = vdupq_n_f32(0), s1 = s0, s2 = s0, s3 = s0; f4 = vset_lane_f32(c1, vdup_n_f32(c0), 1); i0 = vld1_u8(&image[y * stride]); #define M1(i, x, y) \ i1 = vld1_u8(&image[(y) * stride + x]); \ v4 = vreinterpretq_s16_u16(vsubl_u8(i0, i1)); \ v5 = vreinterpretq_s16_u16(vqsubq_u16(v6, \ vreinterpretq_u16_s16(vabsq_s16(v4)))); \ M2(low, s0, s1, i) M2(high, s2, s3, i) #define M2(low, s0, s1, i) \ f0 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v5))); \ f0 = vmulq_f32(f0, f0); f1 = vmulq_lane_f32(f0, f4, i); \ f0 = vmulq_f32(f0, vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v4)))); \ s0 = vmlaq_f32(s0, f0, f1); s1 = vmlaq_f32(s1, f1, f1); M1(1, -1, y-1) M1(0, 0, y-1) M1(1, 1, y-1) M1(0, -1, y) M1(0, 1, y) M1(1, -1, y+1) M1(0, 0, y+1) M1(1, 1, y+1) #undef M1 #undef M2 v4 = vreinterpretq_s16_u16(vmovl_u8(i0)); #define M1(low, s0, s1, x) \ f1 = vbslq_f32(vceqq_f32(s1, vdupq_n_f32(0)), vdupq_n_f32(1.0f), s1); \ f0 = vdivq_f32(s0, f1); \ f1 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v4))); \ f0 = vsubq_f32(f1, f0); \ f0 = vsubq_f32(f0, vdupq_n_f32(CENTERJSAMPLE)); \ vst1q_f32(fbuf + y * n + x, f0); M1(low, s0, s1, 0) M1(high, s2, s3, 4) #undef M1 } #elif 1 && defined(USE_AVX512) for (y = 0; y < n; y += 2) { __m256i v0, v1, v4, v5, v6 = _mm256_set1_epi16((int)range); __m512 f0, f1, f4, f5, s0 = _mm512_setzero_ps(), s1 = s0; __mmask16 m0; f4 = _mm512_set1_ps(c0); f5 = _mm512_set1_ps(c1); #define M2(v0, pos) \ v0 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( \ _mm_loadl_epi64((__m128i*)&image[pos]), \ _mm_loadl_epi64((__m128i*)&image[pos + stride]))); #define M1(f4, x, y) M2(v1, (y) * stride + x) \ v4 = _mm256_sub_epi16(v0, v1); v5 = _mm256_subs_epu16(v6, _mm256_abs_epi16(v4)); \ f0 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v5)); \ f0 = _mm512_mul_ps(f0, f0); f1 = _mm512_mul_ps(f0, f4); \ f0 = _mm512_mul_ps(f0, _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v4))); \ s0 = _mm512_fmadd_ps(f0, f1, s0); s1 = _mm512_fmadd_ps(f1, f1, s1); M2(v0, y * stride) M1(f5, -1, y-1) M1(f4, 0, y-1) M1(f5, 1, y-1) M1(f4, -1, y) M1(f4, 1, y) M1(f5, -1, y+1) M1(f4, 0, y+1) M1(f5, 1, y+1) #undef M1 #undef M2 m0 = _mm512_cmp_ps_mask(s1, _mm512_setzero_ps(), 0); s1 = _mm512_mask_blend_ps(m0, s1, _mm512_set1_ps(1.0f)); f0 = _mm512_div_ps(s0, s1); f1 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v0)); f0 = _mm512_sub_ps(f1, f0); f0 = _mm512_sub_ps(f0, _mm512_set1_ps(CENTERJSAMPLE)); _mm512_storeu_ps(fbuf + y * n, f0); } #elif 1 && defined(USE_AVX2) for (y = 0; y < n; y++) { __m128i v0, v1, v4, v5, v6 = _mm_set1_epi16((int)range); __m256 f0, f1, f4, f5, s0 = _mm256_setzero_ps(), s1 = s0; f4 = _mm256_set1_ps(c0); f5 = _mm256_set1_ps(c1); v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[y * stride])); #define M1(f4, x, y) \ v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[(y) * stride + x])); \ v4 = _mm_sub_epi16(v0, v1); v5 = _mm_subs_epu16(v6, _mm_abs_epi16(v4)); \ f0 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v5)); \ f0 = _mm256_mul_ps(f0, f0); f1 = _mm256_mul_ps(f0, f4); \ f0 = _mm256_mul_ps(f0, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v4))); \ s0 = _mm256_fmadd_ps(f0, f1, s0); s1 = _mm256_fmadd_ps(f1, f1, s1); M1(f5, -1, y-1) M1(f4, 0, y-1) M1(f5, 1, y-1) M1(f4, -1, y) M1(f4, 1, y) M1(f5, -1, y+1) M1(f4, 0, y+1) M1(f5, 1, 
y+1) #undef M1 f1 = _mm256_cmp_ps(s1, _mm256_setzero_ps(), 0); s1 = _mm256_blendv_ps(s1, _mm256_set1_ps(1.0f), f1); f0 = _mm256_div_ps(s0, s1); f1 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v0)); f0 = _mm256_sub_ps(f1, f0); f0 = _mm256_sub_ps(f0, _mm256_set1_ps(CENTERJSAMPLE)); _mm256_storeu_ps(fbuf + y * n, f0); } #elif 1 && defined(USE_SSE2) for (y = 0; y < n; y++) { __m128i v0, v1, v3, v4, v5, v6 = _mm_set1_epi16((int)range), v7 = _mm_setzero_si128(); __m128 f0, f1, f4, f5, s0 = _mm_setzero_ps(), s1 = s0, s2 = s0, s3 = s0; f4 = _mm_set1_ps(c0); f5 = _mm_set1_ps(c1); v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[y * stride])); #define M1(f4, x, y) \ v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[(y) * stride + x])); \ v4 = _mm_sub_epi16(v0, v1); v3 = _mm_srai_epi16(v4, 15); \ v5 = _mm_subs_epu16(v6, _mm_abs_epi16(v4)); \ M2(lo, s0, s1, f4) M2(hi, s2, s3, f4) #define M2(lo, s0, s1, f4) \ f0 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v5, v7)); \ f0 = _mm_mul_ps(f0, f0); f1 = _mm_mul_ps(f0, f4); \ f0 = _mm_mul_ps(f0, _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v4, v3))); \ f0 = _mm_mul_ps(f0, f1); f1 = _mm_mul_ps(f1, f1); \ s0 = _mm_add_ps(s0, f0); s1 = _mm_add_ps(s1, f1); M1(f5, -1, y-1) M1(f4, 0, y-1) M1(f5, 1, y-1) M1(f4, -1, y) M1(f4, 1, y) M1(f5, -1, y+1) M1(f4, 0, y+1) M1(f5, 1, y+1) #undef M1 #undef M2 #define M1(lo, s0, s1, x) \ f1 = _mm_cmpeq_ps(s1, _mm_setzero_ps()); \ f1 = _mm_and_ps(f1, _mm_set1_ps(1.0f)); \ f0 = _mm_div_ps(s0, _mm_or_ps(s1, f1)); \ f1 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v0, v7)); \ f0 = _mm_sub_ps(f1, f0); \ f0 = _mm_sub_ps(f0, _mm_set1_ps(CENTERJSAMPLE)); \ _mm_storeu_ps(fbuf + y * n + x, f0); M1(lo, s0, s1, 0) M1(hi, s2, s3, 4) #undef M1 } #else for (y = 0; y < n; y++) for (x = 0; x < n; x++) { #define M1(i, x, y) t0 = a - image[(y) * stride + x]; \ t = range - fabsf(t0); if (t < 0) t = 0; t *= t; aw = c##i * t; \ a0 += t0 * t * aw; an += aw * aw; int a = image[(y)*stride+(x)]; float a0 = 0, an = 0, aw, t, t0; M1(1, x-1, y-1) M1(0, x, y-1) M1(1, x+1, y-1) M1(0, x-1, y) M1(0, x+1, y) M1(1, x-1, y+1) M1(0, x, y+1) M1(1, x+1, y+1) #undef M1 if (an > 0.0f) a -= a0 / an; fbuf[y * n + x] = a - CENTERJSAMPLE; } #endif fdct_clamp(fbuf, coef, quantval); goto end; } #if 1 && defined(USE_NEON) #define VINITD uint8x8_t i0, i1, i2; #define VDIFF(i) vst1q_u16((uint16_t*)temp + (i) * n, vsubl_u8(i0, i1)); #define VLDPIX(j, p) i##j = vld1_u8(p); #define VRIGHT(a, b) i##a = vext_u8(i##b, i##b, 1); #define VCOPY(a, b) i##a = i##b; #define VINIT \ int16x8_t v0, v5; uint16x8_t v6 = vdupq_n_u16(range); \ float32x4_t f0, f1, s0 = vdupq_n_f32(0), s1 = s0, s2 = s0, s3 = s0; #define VCORE \ v0 = vld1q_s16(temp + y * n); \ v5 = vreinterpretq_s16_u16(vqsubq_u16(v6, \ vreinterpretq_u16_s16(vabsq_s16(v0)))); \ VCORE1(low, s0, s1, tab) VCORE1(high, s2, s3, tab + 4) #define VCORE1(low, s0, s1, tab) \ f0 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v5))); \ f0 = vmulq_f32(f0, f0); f1 = vmulq_f32(f0, vld1q_f32(tab + y * n)); \ f0 = vmulq_f32(f0, vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v0)))); \ s0 = vmlaq_f32(s0, f0, f1); s1 = vmlaq_f32(s1, f1, f1); #ifdef __aarch64__ #define VFIN \ a2 = vaddvq_f32(vaddq_f32(s0, s2)); \ a3 = vaddvq_f32(vaddq_f32(s1, s3)); #else #define VFIN { \ float32x4x2_t p0; float32x2_t v0; \ p0 = vzipq_f32(vaddq_f32(s0, s2), vaddq_f32(s1, s3)); \ f0 = vaddq_f32(p0.val[0], p0.val[1]); \ v0 = vadd_f32(vget_low_f32(f0), vget_high_f32(f0)); \ a2 = vget_lane_f32(v0, 0); a3 = vget_lane_f32(v0, 1); \ } #endif #elif 1 && defined(USE_AVX512) #define VINCR 2 
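/* AVX-512 variant of the shared VINIT/VCORE/VFIN macro interface used by the
 * per-coefficient loop below: VINIT declares the accumulators, VCORE folds one
 * pass over the precomputed pixel differences in temp[] against the weight
 * table tab[] into s0/s1, and VFIN reduces them to the scalars a2 and a3 whose
 * ratio gives the coefficient correction. VINCR is set to 2 because this path
 * consumes two 8-value rows (16 int16 values) per iteration. */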
#define VINIT \ __m256i v4, v5, v6 = _mm256_set1_epi16(range); \ __m512 f0, f1, f4, s0 = _mm512_setzero_ps(), s1 = s0; #define VCORE \ v4 = _mm256_loadu_si256((__m256i*)&temp[y * n]); \ f4 = _mm512_load_ps(tab + y * n); \ v5 = _mm256_subs_epu16(v6, _mm256_abs_epi16(v4)); \ f0 = _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v5)); \ f0 = _mm512_mul_ps(f0, f0); f1 = _mm512_mul_ps(f0, f4); \ f0 = _mm512_mul_ps(f0, _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(v4))); \ s0 = _mm512_fmadd_ps(f0, f1, s0); s1 = _mm512_fmadd_ps(f1, f1, s1); // "reduce_add" is not faster here, because it's a macro, not a single instruction // a2 = _mm512_reduce_add_ps(s0); a3 = _mm512_reduce_add_ps(s1); #define VFIN { __m256 s2, s3, f2; \ f0 = _mm512_shuffle_f32x4(s0, s1, 0x44); \ f1 = _mm512_shuffle_f32x4(s0, s1, 0xee); \ f0 = _mm512_add_ps(f0, f1); s2 = _mm512_castps512_ps256(f0); \ s3 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(f0), 1)); \ f2 = _mm256_permute2f128_ps(s2, s3, 0x20); \ f2 = _mm256_add_ps(f2, _mm256_permute2f128_ps(s2, s3, 0x31)); \ f2 = _mm256_add_ps(f2, _mm256_shuffle_ps(f2, f2, 0xee)); \ f2 = _mm256_add_ps(f2, _mm256_shuffle_ps(f2, f2, 0x55)); \ a2 = _mm256_cvtss_f32(f2); \ a3 = _mm_cvtss_f32(_mm256_extractf128_ps(f2, 1)); } #elif 1 && defined(USE_AVX2) #define VINIT \ __m128i v4, v5, v6 = _mm_set1_epi16(range); \ __m256 f0, f1, f4, s0 = _mm256_setzero_ps(), s1 = s0; #define VCORE \ v4 = _mm_loadu_si128((__m128i*)&temp[y * n]); \ f4 = _mm256_load_ps(tab + y * n); \ v5 = _mm_subs_epu16(v6, _mm_abs_epi16(v4)); \ f0 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v5)); \ f0 = _mm256_mul_ps(f0, f0); f1 = _mm256_mul_ps(f0, f4); \ f0 = _mm256_mul_ps(f0, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v4))); \ s0 = _mm256_fmadd_ps(f0, f1, s0); s1 = _mm256_fmadd_ps(f1, f1, s1); #define VFIN \ f0 = _mm256_permute2f128_ps(s0, s1, 0x20); \ f1 = _mm256_permute2f128_ps(s0, s1, 0x31); \ f0 = _mm256_add_ps(f0, f1); \ f0 = _mm256_add_ps(f0, _mm256_shuffle_ps(f0, f0, 0xee)); \ f0 = _mm256_add_ps(f0, _mm256_shuffle_ps(f0, f0, 0x55)); \ a2 = _mm256_cvtss_f32(f0); \ a3 = _mm_cvtss_f32(_mm256_extractf128_ps(f0, 1)); #elif 1 && defined(USE_SSE2) #define VINIT \ __m128i v3, v4, v5, v6 = _mm_set1_epi16(range), v7 = _mm_setzero_si128(); \ __m128 f0, f1, s0 = _mm_setzero_ps(), s1 = s0, s2 = s0, s3 = s0; #define VCORE \ v4 = _mm_loadu_si128((__m128i*)&temp[y * n]); \ v3 = _mm_srai_epi16(v4, 15); \ v5 = _mm_subs_epu16(v6, _mm_abs_epi16(v4)); \ VCORE1(lo, s0, s1, tab) VCORE1(hi, s2, s3, tab + 4) #define VCORE1(lo, s0, s1, tab) \ f0 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v5, v7)); \ f0 = _mm_mul_ps(f0, f0); \ f1 = _mm_mul_ps(f0, _mm_load_ps(tab + y * n)); \ f0 = _mm_mul_ps(f0, _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v4, v3))); \ f0 = _mm_mul_ps(f0, f1); f1 = _mm_mul_ps(f1, f1); \ s0 = _mm_add_ps(s0, f0); s1 = _mm_add_ps(s1, f1); #define VFIN \ f0 = _mm_add_ps(s0, s2); f1 = _mm_add_ps(s1, s3); \ f0 = _mm_add_ps(_mm_unpacklo_ps(f0, f1), _mm_unpackhi_ps(f0, f1)); \ f0 = _mm_add_ps(f0, _mm_shuffle_ps(f0, f0, 0xee)); \ a2 = _mm_cvtss_f32(f0); \ a3 = _mm_cvtss_f32(_mm_shuffle_ps(f0, f0, 0x55)); #elif !defined(NO_SIMD) // vector code simulation #define VINITD JSAMPLE *p0, *p1, *p2; #define VDIFF(i) for (x = 0; x < n; x++) temp[(i) * n + x] = p0[x] - p1[x]; #define VLDPIX(i, a) p##i = a; #define VRIGHT(a, b) p##a = p##b + 1; #define VCOPY(a, b) p##a = p##b; #define VINIT int j; float a0, a1, f0, sum[DCTSIZE * 2]; \ for (j = 0; j < n * 2; j++) sum[j] = 0; #define VCORE \ for (j = 0; j < n; j++) { \ a0 = temp[y * n + j]; a1 = tab[y * n + j]; 
\ f0 = (float)range - fabsf(a0); if (f0 < 0) f0 = 0; f0 *= f0; \ a0 *= f0; a1 *= f0; a0 *= a1; a1 *= a1; \ sum[j] += a0; sum[j + n] += a1; \ } #define VCORE1(sum) \ ((sum[0] + sum[4]) + (sum[1] + sum[5])) + \ ((sum[2] + sum[6]) + (sum[3] + sum[7])); #define VFIN a2 += VCORE1(sum) a3 += VCORE1((sum+8)) #endif for (y = 0; y < n; y++) { border[y + n * 2] = image[y - stride]; border[y + n * 3] = image[y + stride * n]; border[y + n * 4] = image[y * stride - 1]; border[y + n * 5] = image[y * stride + n]; } for (k = n * n - 1; k > 0; k--) { int i = jpegqs_natural_order[k]; float *tab = tables[i], a2 = 0, a3 = 0; int range = quantval[i] * 2; if (need_refresh && zigzag_refresh[i]) { idct_islow(coef, buf, n); need_refresh = 0; #ifdef VINIT for (y = 0; y < n; y++) { border[y] = buf[y * n]; border[y + n] = buf[y * n + n - 1]; } #ifndef VINITD // same for SSE2, AVX2, AVX512 #define VINITD __m128i v0, v1, v2; #define VDIFF(i) _mm_storeu_si128((__m128i*)&temp[(i) * n], _mm_sub_epi16(v0, v1)); #define VLDPIX(i, p) v##i = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(p))); #define VRIGHT(a, b) v##a = _mm_bsrli_si128(v##b, 2); #define VCOPY(a, b) v##a = v##b; #endif { VINITD VLDPIX(0, buf) VLDPIX(1, border + n * 2) VDIFF(n) VRIGHT(1, 0) VDIFF(0) for (y = 1; y < n; y++) { VLDPIX(1, buf + y * n) VDIFF(y + n + 3) VCOPY(0, 1) VRIGHT(1, 0) VDIFF(y) } VLDPIX(1, border + n * 3) VDIFF(n + 1) VLDPIX(0, border) VLDPIX(1, border + n * 4) VDIFF(n + 2) VLDPIX(0, border + n) VLDPIX(1, border + n * 5) VDIFF(n + 3) if (flags & JPEGQS_DIAGONALS) { VLDPIX(0, buf) for (y = 0; y < n - 1; y++) { VLDPIX(2, buf + y * n + n) VRIGHT(1, 2) VDIFF(n * 2 + 4 + y * 2) VRIGHT(0, 0) VCOPY(1, 2) VDIFF(n * 2 + 4 + y * 2 + 1) VCOPY(0, 2) } } } #undef VINITD #undef VLDPIX #undef VRIGHT #undef VCOPY #undef VDIFF #endif } #ifdef VINIT #ifndef VINCR #define VINCR 1 #endif { int y0 = i & (n - 1) ? 0 : n; int y1 = (i >= n ? n - 1 : 0) + n + 4; VINIT for (y = y0; y < y1; y += VINCR) { VCORE } if (flags & JPEGQS_DIAGONALS) { y0 = n * 2 + 4; y1 = y0 + (n - 1) * 2; for (y = y0; y < y1; y += VINCR) { VCORE } } VFIN } #undef VINCR #undef VINIT #undef VCORE #ifdef VCORE1 #undef VCORE1 #endif #undef VFIN #else { int p; float a0, a1, t; #define CORE t = (float)range - fabsf(a0); \ if (t < 0) t = 0; t *= t; a0 *= t; a1 *= t; a2 += a0 * a1; a3 += a1 * a1; #define M1(a, b) \ for (y = 0; y < n - 1 + a; y++) \ for (x = 0; x < n - 1 + b; x++) { p = y * n + x; \ a0 = buf[p] - buf[(y + b) * n + x + a]; a1 = tab[p]; CORE } #define M2(z, i) for (z = 0; z < n; z++) { p = y * n + x; \ a0 = buf[p] - border[i * n + z]; a1 = *tab++; CORE } if (i & (n - 1)) M1(1, 0) tab += n * n; y = 0; M2(x, 2) y = n - 1; M2(x, 3) x = 0; M2(y, 4) x = n - 1; M2(y, 5) if (i > (n - 1)) M1(0, 1) if (flags & JPEGQS_DIAGONALS) { tab += n * n; for (y = 0; y < n - 1; y++, tab += n * 2) for (x = 0; x < n - 1; x++) { p = y * n + x; a0 = buf[p] - buf[p + n + 1]; a1 = tab[x]; CORE a0 = buf[p + 1] - buf[p + n]; a1 = tab[x + n]; CORE } } #undef M2 #undef M1 #undef CORE } #endif a2 = a2 / a3; range = roundf(a2); if (range) { int div = quantval[i], coef1 = coef[i], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? 
d1 : d0); add = coef1 - range; if (add > dh) add = dh; if (add < dl) add = dl; coef[i] = add; need_refresh |= add ^ coef1; } } end: if (flags & JPEGQS_NO_REBALANCE) return; if (!luma && flags & JPEGQS_NO_REBALANCE_UV) return; #if 1 && defined(USE_NEON) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) { JCOEF orig[DCTSIZE2]; int coef0 = coef[0]; int32_t m0, m1; int32x4_t s0 = vdupq_n_s32(0), s1 = s0; coef[0] = 0; for (k = 0; k < DCTSIZE2; k += 8) { int16x8_t v0, v1, v2, v3; float32x4_t f0, f3, f4, f5; int32x4_t v4; v1 = vld1q_s16((int16_t*)&quantval[k]); v0 = vld1q_s16((int16_t*)&coef[k]); v3 = vshrq_n_s16(v0, 15); v2 = veorq_s16(vaddq_s16(vshrq_n_s16(v1, 1), v3), v3); v2 = vaddq_s16(v0, v2); f3 = vdupq_n_f32(0.5f); f5 = vnegq_f32(f3); #define M1(low, f0) \ v4 = vmovl_s16(vget_##low##_s16(v2)); \ f0 = vbslq_f32(vreinterpretq_u32_s32(vshrq_n_s32(v4, 31)), f5, f3); \ f4 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v1))); \ f0 = vdivq_f32(vaddq_f32(vcvtq_f32_s32(v4), f0), f4); M1(low, f0) M1(high, f3) #undef M1 v2 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(f0)), vmovn_s32(vcvtq_s32_f32(f3))); v2 = vmulq_s16(v2, v1); vst1q_s16((int16_t*)&orig[k], v2); #define M1(low) \ s0 = vmlal_s16(s0, vget_##low##_s16(v0), vget_##low##_s16(v2)); \ s1 = vmlal_s16(s1, vget_##low##_s16(v2), vget_##low##_s16(v2)); M1(low) M1(high) #undef M1 } { #ifdef __aarch64__ m0 = vaddvq_s32(s0); m1 = vaddvq_s32(s1); #else int32x4x2_t v0 = vzipq_s32(s0, s1); int32x2_t v1; s0 = vaddq_s32(v0.val[0], v0.val[1]); v1 = vadd_s32(vget_low_s32(s0), vget_high_s32(s0)); m0 = vget_lane_s32(v1, 0); m1 = vget_lane_s32(v1, 1); #endif } if (m1 > m0) { int mul = (((int64_t)m1 << 13) + (m0 >> 1)) / m0; int16x8_t v4 = vdupq_n_s16(mul); for (k = 0; k < DCTSIZE2; k += 8) { int16x8_t v0, v1, v2, v3; v1 = vld1q_s16((int16_t*)&quantval[k]); v2 = vld1q_s16((int16_t*)&coef[k]); v2 = vqrdmulhq_s16(vshlq_n_s16(v2, 2), v4); v0 = vld1q_s16((int16_t*)&orig[k]); v3 = vaddq_s16(v1, vreinterpretq_s16_u16(vcgeq_s16(v0, vdupq_n_s16(0)))); v2 = vminq_s16(v2, vaddq_s16(v0, vshrq_n_s16(v3, 1))); v3 = vaddq_s16(v1, vreinterpretq_s16_u16(vcleq_s16(v0, vdupq_n_s16(0)))); v2 = vmaxq_s16(v2, vsubq_s16(v0, vshrq_n_s16(v3, 1))); vst1q_s16((int16_t*)&coef[k], v2); } } coef[0] = coef0; } else #elif 1 && defined(USE_AVX2) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) { JCOEF orig[DCTSIZE2]; int coef0 = coef[0]; int32_t m0, m1; __m128i s0 = _mm_setzero_si128(), s1 = s0; coef[0] = 0; for (k = 0; k < DCTSIZE2; k += 8) { __m128i v0, v1, v2, v3; __m256i v4; __m256 f0; v1 = _mm_loadu_si128((__m128i*)&quantval[k]); v0 = _mm_loadu_si128((__m128i*)&coef[k]); v2 = _mm_srli_epi16(v1, 1); v3 = _mm_srai_epi16(v0, 15); v2 = _mm_xor_si128(_mm_add_epi16(v2, v3), v3); v2 = _mm_add_epi16(v0, v2); f0 = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v2)); f0 = _mm256_div_ps(f0, _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(v1))); v4 = _mm256_cvttps_epi32(f0); v2 = _mm_packs_epi32(_mm256_castsi256_si128(v4), _mm256_extractf128_si256(v4, 1)); v2 = _mm_mullo_epi16(v2, v1); _mm_storeu_si128((__m128i*)&orig[k], v2); s0 = _mm_add_epi32(s0, _mm_madd_epi16(v0, v2)); s1 = _mm_add_epi32(s1, _mm_madd_epi16(v2, v2)); } s0 = _mm_hadd_epi32(s0, s1); s0 = _mm_hadd_epi32(s0, s0); m0 = _mm_cvtsi128_si32(s0); m1 = _mm_extract_epi32(s0, 1); if (m1 > m0) { int mul = (((int64_t)m1 << 13) + (m0 >> 1)) / m0; __m256i v4 = _mm256_set1_epi16(mul); for (k = 0; k < DCTSIZE2; k += 16) { __m256i v0, v1, v2, v3; v1 = _mm256_loadu_si256((__m256i*)&quantval[k]); v2 = 
_mm256_loadu_si256((__m256i*)&coef[k]); v2 = _mm256_mulhrs_epi16(_mm256_slli_epi16(v2, 2), v4); v0 = _mm256_loadu_si256((__m256i*)&orig[k]); v1 = _mm256_add_epi16(v1, _mm256_set1_epi16(-1)); v3 = _mm256_sub_epi16(v1, _mm256_srai_epi16(v0, 15)); v2 = _mm256_min_epi16(v2, _mm256_add_epi16(v0, _mm256_srai_epi16(v3, 1))); v3 = _mm256_sub_epi16(v1, _mm256_cmpgt_epi16(v0, _mm256_setzero_si256())); v2 = _mm256_max_epi16(v2, _mm256_sub_epi16(v0, _mm256_srai_epi16(v3, 1))); _mm256_storeu_si256((__m256i*)&coef[k], v2); } } coef[0] = coef0; } else #elif 1 && defined(USE_SSE2) if (sizeof(quantval[0]) == 2 && sizeof(quantval[0]) == sizeof(coef[0])) { JCOEF orig[DCTSIZE2]; int coef0 = coef[0]; int32_t m0, m1; __m128i s0 = _mm_setzero_si128(), s1 = s0; coef[0] = 0; for (k = 0; k < DCTSIZE2; k += 8) { __m128i v0, v1, v2, v3, v7; __m128 f0, f2, f4; v1 = _mm_loadu_si128((__m128i*)&quantval[k]); v0 = _mm_loadu_si128((__m128i*)&coef[k]); v2 = _mm_srli_epi16(v1, 1); v3 = _mm_srai_epi16(v0, 15); v2 = _mm_xor_si128(_mm_add_epi16(v2, v3), v3); v2 = _mm_add_epi16(v0, v2); v7 = _mm_setzero_si128(); v3 = _mm_srai_epi16(v2, 15); #define M1(lo, f0) \ f4 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v1, v7)); \ f0 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v2, v3)); \ f0 = _mm_div_ps(f0, f4); M1(lo, f0) M1(hi, f2) #undef M1 v2 = _mm_packs_epi32(_mm_cvttps_epi32(f0), _mm_cvttps_epi32(f2)); v2 = _mm_mullo_epi16(v2, v1); _mm_storeu_si128((__m128i*)&orig[k], v2); s0 = _mm_add_epi32(s0, _mm_madd_epi16(v0, v2)); s1 = _mm_add_epi32(s1, _mm_madd_epi16(v2, v2)); } #ifdef USE_SSE4 s0 = _mm_hadd_epi32(s0, s1); s0 = _mm_hadd_epi32(s0, s0); m0 = _mm_cvtsi128_si32(s0); m1 = _mm_extract_epi32(s0, 1); #else s0 = _mm_add_epi32(_mm_unpacklo_epi32(s0, s1), _mm_unpackhi_epi32(s0, s1)); s0 = _mm_add_epi32(s0, _mm_bsrli_si128(s0, 8)); m0 = _mm_cvtsi128_si32(s0); m1 = _mm_cvtsi128_si32(_mm_bsrli_si128(s0, 4)); #endif if (m1 > m0) { int mul = (((int64_t)m1 << 13) + (m0 >> 1)) / m0; __m128i v4 = _mm_set1_epi16(mul); for (k = 0; k < DCTSIZE2; k += 8) { __m128i v0, v1, v2, v3 = _mm_set1_epi16(-1); v1 = _mm_loadu_si128((__m128i*)&quantval[k]); v2 = _mm_loadu_si128((__m128i*)&coef[k]); #ifdef USE_SSE4 v2 = _mm_mulhrs_epi16(_mm_slli_epi16(v2, 2), v4); #else v2 = _mm_mulhi_epi16(_mm_slli_epi16(v2, 4), v4); v2 = _mm_srai_epi16(_mm_sub_epi16(v2, v3), 1); #endif v0 = _mm_loadu_si128((__m128i*)&orig[k]); v1 = _mm_add_epi16(v1, v3); v3 = _mm_sub_epi16(v1, _mm_srai_epi16(v0, 15)); v2 = _mm_min_epi16(v2, _mm_add_epi16(v0, _mm_srai_epi16(v3, 1))); v3 = _mm_sub_epi16(v1, _mm_cmpgt_epi16(v0, _mm_setzero_si128())); v2 = _mm_max_epi16(v2, _mm_sub_epi16(v0, _mm_srai_epi16(v3, 1))); _mm_storeu_si128((__m128i*)&coef[k], v2); } } coef[0] = coef0; } else #endif { JCOEF orig[DCTSIZE2]; int64_t m0 = 0, m1 = 0; for (k = 1; k < DCTSIZE2; k++) { int div = quantval[k], coef1 = coef[k], d1 = div >> 1; int a0 = (coef1 + (coef1 < 0 ? -d1 : d1)) / div * div; orig[k] = a0; m0 += coef1 * a0; m1 += a0 * a0; } if (m1 > m0) { int mul = ((m1 << 13) + (m0 >> 1)) / m0; for (k = 1; k < DCTSIZE2; k++) { int div = quantval[k], coef1 = coef[k], add; int dh, dl, d0 = (div - 1) >> 1, d1 = div >> 1; int a0 = orig[k]; dh = a0 + (a0 < 0 ? d1 : d0); dl = a0 - (a0 > 0 ? 
d1 : d0); add = (coef1 * mul + 0x1000) >> 13; if (add > dh) add = dh; if (add < dl) add = dl; coef[k] = add; } } } } static void upsample_row(int w1, int y0, int y1, JSAMPLE *image, JSAMPLE *image2, int stride, JSAMPLE *image1, int stride1, JSAMPLE *mem, int st, int ww, int ws, int hs) { float ALIGN(32) fbuf[DCTSIZE2]; int x, y, xx, yy, n = DCTSIZE; image += (y0 + 1) * stride + 1; image2 += (y0 + 1) * stride + 1; image1 += (y0 * hs + 1) * stride1 + 1; mem += y0 * hs * st; y1 -= y0; for (xx = 0; xx < w1; xx += n, image += n, image2 += n) { JSAMPLE *p1 = image1 + xx * ws, *out = mem + xx * ws; #if 1 && defined(USE_NEON) for (y = 0; y < n; y++) { uint8x8_t h0, h1; uint16x8_t sumA, sumB, v0, v1; uint16x4_t h2, h3; float32x4_t v5, scale; uint32x4_t v4, sumAA1, sumAB1, sumAA2, sumAB2; #define M1(xx, yy) \ h0 = vld1_u8(&image2[(y + yy) * stride + xx]); \ h1 = vld1_u8(&image[(y + yy) * stride + xx]); \ sumA = vaddw_u8(sumA, h0); v0 = vmull_u8(h0, h0); \ sumB = vaddw_u8(sumB, h1); v1 = vmull_u8(h0, h1); \ sumAA1 = vaddw_u16(sumAA1, vget_low_u16(v0)); \ sumAB1 = vaddw_u16(sumAB1, vget_low_u16(v1)); \ sumAA2 = vaddw_u16(sumAA2, vget_high_u16(v0)); \ sumAB2 = vaddw_u16(sumAB2, vget_high_u16(v1)); #define M2 \ sumA = vaddq_u16(sumA, sumA); sumB = vaddq_u16(sumB, sumB); \ sumAA1 = vaddq_u32(sumAA1, sumAA1); sumAA2 = vaddq_u32(sumAA2, sumAA2); \ sumAB1 = vaddq_u32(sumAB1, sumAB1); sumAB2 = vaddq_u32(sumAB2, sumAB2); h0 = vld1_u8(&image2[y * stride]); h1 = vld1_u8(&image[y * stride]); sumA = vmovl_u8(h0); v0 = vmull_u8(h0, h0); sumB = vmovl_u8(h1); v1 = vmull_u8(h0, h1); sumAA1 = vmovl_u16(vget_low_u16(v0)); sumAB1 = vmovl_u16(vget_low_u16(v1)); sumAA2 = vmovl_u16(vget_high_u16(v0)); sumAB2 = vmovl_u16(vget_high_u16(v1)); M2 M1(0, -1) M1(-1, 0) M1(1, 0) M1(0, 1) M2 M1(-1, -1) M1(1, -1) M1(-1, 1) M1(1, 1) #undef M2 #undef M1 v0 = vmovl_u8(vld1_u8(&image2[y * stride])); #define M1(low, sumAA, sumAB, x) \ h2 = vget_##low##_u16(sumA); sumAA = vshlq_n_u32(sumAA, 4); \ h3 = vget_##low##_u16(sumB); sumAB = vshlq_n_u32(sumAB, 4); \ sumAA = vmlsl_u16(sumAA, h2, h2); sumAB = vmlsl_u16(sumAB, h2, h3); \ v4 = vtstq_u32(sumAA, sumAA); \ sumAB = vandq_u32(sumAB, v4); sumAA = vornq_u32(sumAA, v4); \ scale = vdivq_f32(vcvtq_f32_s32(vreinterpretq_s32_u32(sumAB)), \ vcvtq_f32_s32(vreinterpretq_s32_u32(sumAA))); \ scale = vmaxq_f32(scale, vdupq_n_f32(-16.0f)); \ scale = vminq_f32(scale, vdupq_n_f32(16.0f)); \ v5 = scale; \ vst1q_f32(fbuf + y * n + x, v5); M1(low, sumAA1, sumAB1, 0) M1(high, sumAA2, sumAB2, 4) #undef M1 } #elif 1 && defined(USE_AVX2) for (y = 0; y < n; y++) { __m128i v0, v1; __m256i v2, v3, v4, sumA, sumB, sumAA, sumAB; __m256 v5, scale; #define M1(x0, y0, x1, y1) \ v0 = _mm_loadl_epi64((__m128i*)&image2[(y + y0) * stride + x0]); \ v1 = _mm_loadl_epi64((__m128i*)&image2[(y + y1) * stride + x1]); \ v2 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v0, v1)); \ v0 = _mm_loadl_epi64((__m128i*)&image[(y + y0) * stride + x0]); \ v1 = _mm_loadl_epi64((__m128i*)&image[(y + y1) * stride + x1]); \ v3 = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v0, v1)); \ sumA = _mm256_add_epi16(sumA, v2); \ sumB = _mm256_add_epi16(sumB, v3); \ sumAA = _mm256_add_epi32(sumAA, _mm256_madd_epi16(v2, v2)); \ sumAB = _mm256_add_epi32(sumAB, _mm256_madd_epi16(v2, v3)); v0 = _mm_loadl_epi64((__m128i*)&image2[y * stride]); v1 = _mm_loadl_epi64((__m128i*)&image[y * stride]); sumA = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v0, v0)); sumB = _mm256_cvtepu8_epi16(_mm_unpacklo_epi8(v1, v1)); sumAA = _mm256_madd_epi16(sumA, sumA); sumAB = 
_mm256_madd_epi16(sumA, sumB); M1(0, -1, -1, 0) M1(1, 0, 0, 1) sumA = _mm256_add_epi16(sumA, sumA); sumAA = _mm256_add_epi32(sumAA, sumAA); sumB = _mm256_add_epi16(sumB, sumB); sumAB = _mm256_add_epi32(sumAB, sumAB); M1(-1, -1, 1, -1) M1(-1, 1, 1, 1) #undef M1 v3 = _mm256_set1_epi16(1); v2 = _mm256_madd_epi16(sumA, v3); sumAA = _mm256_slli_epi32(sumAA, 4); v3 = _mm256_madd_epi16(sumB, v3); sumAB = _mm256_slli_epi32(sumAB, 4); sumAA = _mm256_sub_epi32(sumAA, _mm256_mullo_epi32(v2, v2)); sumAB = _mm256_sub_epi32(sumAB, _mm256_mullo_epi32(v2, v3)); v4 = _mm256_cmpeq_epi32(sumAA, _mm256_setzero_si256()); sumAB = _mm256_andnot_si256(v4, sumAB); scale = _mm256_cvtepi32_ps(_mm256_or_si256(sumAA, v4)); scale = _mm256_div_ps(_mm256_cvtepi32_ps(sumAB), scale); scale = _mm256_max_ps(scale, _mm256_set1_ps(-16.0f)); scale = _mm256_min_ps(scale, _mm256_set1_ps(16.0f)); v5 = scale; _mm256_storeu_ps(fbuf + y * n, v5); } #elif 1 && defined(USE_SSE2) for (y = 0; y < y1; y++) { __m128i v0, v1, v2, v3, v4, sumA, sumB, sumAA1, sumAB1, sumAA2, sumAB2; __m128 v5, scale; #define M1(x0, y0, x1, y1) \ v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[(y + y0) * stride + x0])); \ v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[(y + y1) * stride + x1])); \ v2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[(y + y0) * stride + x0])); \ v3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[(y + y1) * stride + x1])); \ sumA = _mm_add_epi16(_mm_add_epi16(sumA, v0), v1); \ sumB = _mm_add_epi16(_mm_add_epi16(sumB, v2), v3); \ v4 = _mm_unpacklo_epi16(v0, v1); sumAA1 = _mm_add_epi32(sumAA1, _mm_madd_epi16(v4, v4)); \ v1 = _mm_unpackhi_epi16(v0, v1); sumAA2 = _mm_add_epi32(sumAA2, _mm_madd_epi16(v1, v1)); \ sumAB1 = _mm_add_epi32(sumAB1, _mm_madd_epi16(v4, _mm_unpacklo_epi16(v2, v3))); \ sumAB2 = _mm_add_epi32(sumAB2, _mm_madd_epi16(v1, _mm_unpackhi_epi16(v2, v3))); v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[y * stride])); v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[y * stride])); v2 = _mm_unpacklo_epi16(v0, v0); sumAA1 = _mm_madd_epi16(v2, v2); v3 = _mm_unpacklo_epi16(v1, v1); sumAB1 = _mm_madd_epi16(v2, v3); v2 = _mm_unpackhi_epi16(v0, v0); sumAA2 = _mm_madd_epi16(v2, v2); v3 = _mm_unpackhi_epi16(v1, v1); sumAB2 = _mm_madd_epi16(v2, v3); sumA = _mm_add_epi16(v0, v0); sumB = _mm_add_epi16(v1, v1); M1(0, -1, -1, 0) M1(1, 0, 0, 1) sumA = _mm_add_epi16(sumA, sumA); sumB = _mm_add_epi16(sumB, sumB); sumAA1 = _mm_add_epi32(sumAA1, sumAA1); sumAA2 = _mm_add_epi32(sumAA2, sumAA2); sumAB1 = _mm_add_epi32(sumAB1, sumAB1); sumAB2 = _mm_add_epi32(sumAB2, sumAB2); M1(-1, -1, 1, -1) M1(-1, 1, 1, 1) #undef M1 v0 = _mm_setzero_si128(); #define M1(lo, sumAA, sumAB, x) \ v2 = _mm_unpack##lo##_epi16(sumA, v0); sumAA = _mm_slli_epi32(sumAA, 4); \ v3 = _mm_unpack##lo##_epi16(sumB, v0); sumAB = _mm_slli_epi32(sumAB, 4); \ sumAA = _mm_sub_epi32(sumAA, _mm_mullo_epi32(v2, v2)); \ sumAB = _mm_sub_epi32(sumAB, _mm_mullo_epi32(v2, v3)); \ v4 = _mm_cmpeq_epi32(sumAA, v0); sumAB = _mm_andnot_si128(v4, sumAB); \ scale = _mm_cvtepi32_ps(_mm_or_si128(sumAA, v4)); \ scale = _mm_div_ps(_mm_cvtepi32_ps(sumAB), scale); \ scale = _mm_max_ps(scale, _mm_set1_ps(-16.0f)); \ scale = _mm_min_ps(scale, _mm_set1_ps(16.0f)); \ v5 = scale; \ _mm_storeu_ps(fbuf + y * n + x, v5); M1(lo, sumAA1, sumAB1, 0) M1(hi, sumAA2, sumAB2, 4) #undef M1 } #else for (y = 0; y < y1; y++) for (x = 0; x < n; x++) { float sumA = 0, sumB = 0, sumAA = 0, sumAB = 0; float divN = 1.0f / 16, scale; #define M1(xx, yy) { \ float a = image2[(y + 
yy) * stride + x + xx]; \ float b = image[(y + yy) * stride + x + xx]; \ sumA += a; sumAA += a * a; \ sumB += b; sumAB += a * b; } #define M2 sumA += sumA; sumB += sumB; \ sumAA += sumAA; sumAB += sumAB; M1(0, 0) M2 M1(0, -1) M1(-1, 0) M1(1, 0) M1(0, 1) M2 M1(-1, -1) M1(1, -1) M1(-1, 1) M1(1, 1) #undef M2 #undef M1 scale = sumAA - sumA * divN * sumA; if (scale != 0.0f) scale = (sumAB - sumA * divN * sumB) / scale; scale = scale < -16.0f ? -16.0f : scale; scale = scale > 16.0f ? 16.0f : scale; // offset = (sumB - scale * sumA) * divN; fbuf[y * n + x] = scale; } #endif // faster case for 4:2:0 if (1 && !((ws - 2) | (hs - 2))) #if 1 && defined(USE_NEON) for (y = 0; y < y1; y++) { int16x8_t v0, v1, v4, v5, v6; float32x4x2_t q0, q1; float32x4_t f0, f1, f2, f3; v0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(&image[y * stride]))); v1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(&image2[y * stride]))); #define M3(low, x) \ f2 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v0))); \ f3 = vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v1))); \ f0 = vld1q_f32(&fbuf[y * n + x]); \ f3 = vaddq_f32(vmlsq_f32(f2, f3, f0), vdupq_n_f32(0.5f)); \ q0 = vzipq_f32(f0, f0); q1 = vzipq_f32(f3, f3); \ M2(v6, y * 2, x) M2(v4, y * 2 + 1, x) \ vst1_u8(&out[y * 2 * st + x * 2], vqmovun_s16(v6)); \ vst1_u8(&out[y * 2 * st + st + x * 2], vqmovun_s16(v4)); #define M1(f0, i, low) \ f0 = vmlaq_f32(q1.val[i], q0.val[i], vcvtq_f32_s32(vmovl_s16(vget_##low##_s16(v5)))); #define M2(v4, y, x) \ v5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(&p1[(y) * stride1 + x * 2]))); \ M1(f0, 0, low) M1(f1, 1, high) \ v4 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(f0)), vmovn_s32(vcvtq_s32_f32(f1))); M3(low, 0) M3(high, 4) #undef M3 #undef M2 #undef M1 } #elif 1 && defined(USE_AVX2) for (y = 0; y < y1; y++) { __m128i v0, v1; __m256i v4, v5, v6; __m256 s0, s1, f0, f2, f3; v0 = _mm_loadl_epi64((__m128i*)&image[y * stride]); v1 = _mm_loadl_epi64((__m128i*)&image2[y * stride]); f2 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(v0)); f3 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(v1)); s1 = _mm256_loadu_ps(&fbuf[y * n]); f3 = _mm256_sub_ps(f2, _mm256_mul_ps(f3, s1)); f3 = _mm256_add_ps(f3, _mm256_set1_ps(0.5f)); s1 = _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(s1), 0xd8)); f3 = _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(f3), 0xd8)); s0 = _mm256_unpacklo_ps(s1, s1); s1 = _mm256_unpackhi_ps(s1, s1); f2 = _mm256_unpacklo_ps(f3, f3); f3 = _mm256_unpackhi_ps(f3, f3); #define M1(v4, s0, f2, v0) \ f0 = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(v0)); \ v4 = _mm256_cvttps_epi32(_mm256_fmadd_ps(f0, s0, f2)); #define M2(v4, y) \ v0 = _mm_loadu_si128((__m128i*)&p1[(y) * stride1]); \ M1(v5, s0, f2, v0) M1(v4, s1, f3, _mm_bsrli_si128(v0, 8)) \ v4 = _mm256_packs_epi32(v5, v4); M2(v6, y * 2) M2(v4, y * 2 + 1) #undef M2 #undef M1 v4 = _mm256_packus_epi16(v6, v4); v0 = _mm256_castsi256_si128(v4); v1 = _mm256_extractf128_si256(v4, 1); _mm_storeu_si128((__m128i*)&out[y * 2 * st], _mm_unpacklo_epi32(v0, v1)); _mm_storeu_si128((__m128i*)&out[y * 2 * st + st], _mm_unpackhi_epi32(v0, v1)); } #elif 1 && defined(USE_SSE2) for (y = 0; y < y1; y++) { __m128i v0, v1, v4, v5, v6, v7 = _mm_setzero_si128(); __m128 s0, s1, f0, f1, f2, f3; v0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image[y * stride])); v1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&image2[y * stride])); #define M3(lo, x) \ f2 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v0, v7)); \ f3 = _mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v1, v7)); \ s1 = _mm_loadu_ps(&fbuf[y * n + x]); \ f3 = _mm_sub_ps(f2, _mm_mul_ps(f3, s1)); 
\ f3 = _mm_add_ps(f3, _mm_set1_ps(0.5f)); \ s0 = _mm_unpacklo_ps(s1, s1); s1 = _mm_unpackhi_ps(s1, s1); \ f2 = _mm_unpacklo_ps(f3, f3); f3 = _mm_unpackhi_ps(f3, f3); \ M2(v6, y * 2, x) M2(v4, y * 2 + 1, x) \ v4 = _mm_packus_epi16(v6, v4); \ _mm_storel_epi64((__m128i*)&out[y * 2 * st + x * 2], v4); \ _mm_storel_epi64((__m128i*)&out[y * 2 * st + st + x * 2], _mm_bsrli_si128(v4, 8)); #define M1(f0, s0, f2, lo) \ f0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpack##lo##_epi16(v5, v7)), s0), f2); #define M2(v4, y, x) \ v5 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)&p1[(y) * stride1 + x * 2])); \ M1(f0, s0, f2, lo) M1(f1, s1, f3, hi) \ v4 = _mm_packs_epi32(_mm_cvttps_epi32(f0), _mm_cvttps_epi32(f1)); M3(lo, 0) M3(hi, 4) #undef M3 #undef M2 #undef M1 } #else for (y = 0; y < y1; y++) for (x = 0; x < n; x++) { int a0, a1, a2, a3; float scale = fbuf[y * n + x], offset; offset = image[y * stride + x] - image2[y * stride + x] * scale + 0.5f; #define M1(a, xx, yy) \ a = p1[(y * hs + yy) * stride1 + x * ws + xx] * scale + offset; \ a = a < 0 ? 0 : a > MAXJSAMPLE ? MAXJSAMPLE : a; M1(a0, 0, 0) M1(a1, 1, 0) M1(a2, 0, 1) M1(a3, 1, 1) #undef M1 #define M1(a, xx, yy) out[(y * hs + yy) * st + x * ws + xx] = a; M1(a0, 0, 0) M1(a1, 1, 0) M1(a2, 0, 1) M1(a3, 1, 1) #undef M1 } #endif else for (y = 0; y < y1; y++) for (x = 0; x < n; x++) { int xx, yy, a; float scale = fbuf[y * n + x], offset; offset = image[y * stride + x] - image2[y * stride + x] * scale + 0.5f; for (yy = 0; yy < hs; yy++) for (xx = 0; xx < ws; xx++) { a = p1[(y * hs + yy) * stride1 + x * ws + xx] * scale + offset; out[(y * hs + yy) * st + x * ws + xx] = a < 0 ? 0 : a > MAXJSAMPLE ? MAXJSAMPLE : a; } } } for (yy = y0 * hs; yy < y1 * hs; yy++) { int x, a = mem[yy * st + w1 * ws - 1]; for (x = w1 * ws; x < ww; x++) mem[yy * st + x] = a; } } //#define PRECISE_PROGRESS #ifndef PROGRESS_PTR #define PROGRESS_PTR opts->progress #endif #ifndef QS_NAME #define QS_NAME do_quantsmooth #endif JPEGQS_ATTR int QS_NAME(j_decompress_ptr srcinfo, jvirt_barray_ptr *coef_arrays, jpegqs_control_t *opts) { JDIMENSION comp_width, comp_height, blk_y; int i, ci, stride, iter, stride1 = 0, need_downsample = 0; jpeg_component_info *compptr; int64_t size; JQUANT_TBL *qtbl; JSAMPLE *image, *image1 = NULL, *image2 = NULL; int num_iter = opts->niter, old_threads = -1; int prog_next = 0, prog_max = 0, prog_thr = 0, prog_prec = opts->progprec; #ifdef PRECISE_PROGRESS volatile int stop = 0; #else int stop = 0; #endif jvirt_barray_ptr coef_up[2] = { NULL, NULL }; float **tables = NULL; #ifdef WITH_LOG int64_t time = 0; if (opts->flags & JPEGQS_INFO_COMP1) for (ci = 0; ci < srcinfo->num_components; ci++) { compptr = srcinfo->comp_info + ci; i = compptr->quant_tbl_no; logfmt("component[%i] : table %i, samp %ix%i\n", ci, i, compptr->h_samp_factor, compptr->v_samp_factor); } if (opts->flags & JPEGQS_INFO_QUANT) for (i = 0; i < NUM_QUANT_TBLS; i++) { int x, y; qtbl = srcinfo->quant_tbl_ptrs[i]; if (!qtbl) continue; logfmt("quant[%i]:\n", i); for (y = 0; y < DCTSIZE; y++) { for (x = 0; x < DCTSIZE; x++) logfmt("%04x ", qtbl->quantval[y * DCTSIZE + x]); logfmt("\n"); } } if (opts->flags & JPEGQS_INFO_TIME) time = get_time_usec(); #endif compptr = srcinfo->comp_info; if (opts->flags & (JPEGQS_JOINT_YUV | JPEGQS_UPSAMPLE_UV) && srcinfo->jpeg_color_space == JCS_YCbCr && !((compptr[1].h_samp_factor - 1) | (compptr[1].v_samp_factor - 1) | (compptr[2].h_samp_factor - 1) | (compptr[2].v_samp_factor - 1))) { need_downsample = 1; } if (num_iter < 0) num_iter = 0; if (num_iter > 
JPEGQS_ITER_MAX) num_iter = JPEGQS_ITER_MAX; if (num_iter <= 0 && !(opts->flags & JPEGQS_UPSAMPLE_UV && need_downsample)) return 0; range_limit_init(); if (!(opts->flags & JPEGQS_LOW_QUALITY)) { tables = quantsmooth_init(opts->flags); if (!tables) return 0; } (void)old_threads; #ifdef _OPENMP if (opts->threads >= 0) { old_threads = omp_get_max_threads(); omp_set_num_threads(opts->threads ? opts->threads : omp_get_num_procs()); } #endif if (opts->progress) { for (ci = 0; ci < srcinfo->num_components; ci++) { compptr = srcinfo->comp_info + ci; prog_max += compptr->height_in_blocks * compptr->v_samp_factor * num_iter; } if (prog_prec == 0) prog_prec = 20; if (prog_prec < 0) prog_prec = prog_max; prog_thr = (unsigned)(prog_max + prog_prec - 1) / (unsigned)prog_prec; } for (ci = 0; ci < srcinfo->num_components; ci++) { int extra_refresh = 0, num_iter2 = num_iter; int prog_cur = prog_next, prog_inc; compptr = srcinfo->comp_info + ci; comp_width = compptr->width_in_blocks; comp_height = compptr->height_in_blocks; prog_inc = compptr->v_samp_factor; prog_next += comp_height * prog_inc * num_iter; if (!(qtbl = compptr->quant_table)) continue; if (image1 || (!ci && need_downsample)) extra_refresh = 1; // skip if already processed { int val = 0; for (i = 0; i < DCTSIZE2; i++) val |= qtbl->quantval[i]; if (val <= 1) num_iter2 = 0; } if (num_iter2 + extra_refresh == 0) continue; image = NULL; if (!stop) { // keeping block pointers aligned stride = comp_width * DCTSIZE + 8; size = ((int64_t)(comp_height * DCTSIZE + 2) * stride + 8) * sizeof(JSAMPLE); if (size == (int64_t)(size_t)size) image = (JSAMPLE*)malloc(size); } if (!image) { #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, coef_arrays[ci], blk_y, 1, TRUE); for (blk_x = 0; blk_x < comp_width; blk_x++) { JCOEFPTR coef = buffer[0][blk_x]; int i; for (i = 0; i < DCTSIZE2; i++) coef[i] *= qtbl->quantval[i]; } } continue; } image += 7; #ifdef WITH_LOG if (opts->flags & JPEGQS_INFO_COMP2) logfmt("component[%i] : size %ix%i\n", ci, comp_width, comp_height); #endif #define IMAGEPTR (blk_y * DCTSIZE + 1) * stride + blk_x * DCTSIZE + 1 #ifdef USE_JSIMD JSAMPROW output_buf[DCTSIZE]; for (i = 0; i < DCTSIZE; i++) output_buf[i] = image + i * stride; #endif for (iter = 0; iter < num_iter2 + extra_refresh; iter++) { #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, coef_arrays[ci], blk_y, 1, TRUE); for (blk_x = 0; blk_x < comp_width; blk_x++) { JCOEFPTR coef = buffer[0][blk_x]; int i; if (!iter) for (i = 0; i < DCTSIZE2; i++) coef[i] *= qtbl->quantval[i]; #ifdef USE_JSIMD int output_col = IMAGEPTR; #endif idct_islow(coef, image + IMAGEPTR, stride); } } { int y, w = comp_width * DCTSIZE, h = comp_height * DCTSIZE; for (y = 1; y < h + 1; y++) { image[y * stride] = image[y * stride + 1]; image[y * stride + w + 1] = image[y * stride + w]; } memcpy(image, image + stride, stride * sizeof(JSAMPLE)); memcpy(image + (h + 1) * stride, image + h * stride, stride * sizeof(JSAMPLE)); } if (iter == num_iter2) break; #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, coef_arrays[ci], blk_y, 1, TRUE); 
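/* Smooth every block of this row in place; for the chroma components the
 * downsampled luma plane (image2) is passed as a guide when the
 * JPEGQS_JOINT_YUV flag is set. */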
#ifdef PRECISE_PROGRESS if (stop) continue; #endif for (blk_x = 0; blk_x < comp_width; blk_x++) { JSAMPLE *p2 = image2 && opts->flags & JPEGQS_JOINT_YUV ? image2 + IMAGEPTR : NULL; JCOEFPTR coef = buffer[0][blk_x]; quantsmooth_block(coef, qtbl->quantval, image + IMAGEPTR, p2, stride, opts->flags, tables, !ci || srcinfo->jpeg_color_space != JCS_YCbCr); } #ifdef PRECISE_PROGRESS if (opts->progress) { int cur = __sync_add_and_fetch(&prog_cur, prog_inc); if (cur >= prog_thr && omp_get_thread_num() == 0) { cur = (int64_t)prog_prec * cur / prog_max; prog_thr = ((int64_t)(cur + 1) * prog_max + prog_prec - 1) / prog_prec; stop = PROGRESS_PTR(opts->userdata, cur, prog_prec); } } #endif } #ifdef PRECISE_PROGRESS if (stop) break; #else if (opts->progress) { int cur = prog_cur += comp_height * prog_inc; if (cur >= prog_thr) { cur = (int64_t)prog_prec * cur / prog_max; prog_thr = ((int64_t)(cur + 1) * prog_max + prog_prec - 1) / prog_prec; stop = PROGRESS_PTR(opts->userdata, cur, prog_prec); } if (stop) break; } #endif } // iter if (!stop && image1) { JSAMPLE *mem; int st, w1, h1, h2, ws, hs, ww, hh; compptr = srcinfo->comp_info; ws = compptr[0].h_samp_factor; hs = compptr[0].v_samp_factor; w1 = (srcinfo->image_width + ws - 1) / ws; h1 = (srcinfo->image_height + hs - 1) / hs; comp_width = compptr[0].width_in_blocks; comp_height = compptr[0].height_in_blocks; coef_up[ci - 1] = (*srcinfo->mem->request_virt_barray) ((j_common_ptr)srcinfo, JPOOL_IMAGE, FALSE, comp_width, comp_height, 1); (*srcinfo->mem->realize_virt_arrays) ((j_common_ptr)srcinfo); #ifdef _OPENMP // need to suppress JERR_BAD_VIRTUAL_ACCESS for (blk_y = 0; blk_y < comp_height; blk_y++) { (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, coef_up[ci - 1], blk_y, 1, TRUE); } #endif ww = comp_width * DCTSIZE; hh = comp_height * DCTSIZE; st = ((w1 + DCTSIZE) & -DCTSIZE) * ws; h2 = ((h1 + DCTSIZE) & -DCTSIZE) * hs; size = (int64_t)h2 * st * sizeof(JSAMPLE); mem = (JSAMPLE*)(size == (int64_t)(size_t)size ? malloc(size) : NULL); if (mem) { int y; #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (y = 0; y < h1; y += DCTSIZE) { int y1 = y + DCTSIZE; y1 = y1 < h1 ? y1 : h1; upsample_row(w1, y, y1, image, image2, stride, image1, stride1, mem, st, ww, ws, hs); } for (y = h1 * hs; y < hh; y++) memcpy(mem + y * st, mem + (h1 * hs - 1) * st, st * sizeof(JSAMPLE)); #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (blk_y = 0; blk_y < comp_height; blk_y++) { JDIMENSION blk_x; JBLOCKARRAY buffer = (*srcinfo->mem->access_virt_barray) ((j_common_ptr)srcinfo, coef_up[ci - 1], blk_y, 1, TRUE); for (blk_x = 0; blk_x < comp_width; blk_x++) { float ALIGN(32) buf[DCTSIZE2]; int x, y, n = DCTSIZE; JSAMPLE *p = mem + blk_y * n * st + blk_x * n; JCOEFPTR coef = buffer[0][blk_x]; for (y = 0; y < n; y++) for (x = 0; x < n; x++) buf[y * n + x] = p[y * st + x] - CENTERJSAMPLE; fdct_float(buf, buf); for (x = 0; x < n * n; x++) coef[x] = roundf(buf[x]); } } free(mem); } } else if (!stop && !ci && need_downsample) do { // make downsampled copy of Y component int y, w, h, w1, h1, st, ws, hs; ws = compptr[0].h_samp_factor; hs = compptr[0].v_samp_factor; if ((ws - 1) | (hs - 1)) { if (opts->flags & JPEGQS_UPSAMPLE_UV) { image1 = image; stride1 = stride; } } else { image2 = image; break; } w = compptr[1].width_in_blocks * DCTSIZE; h = compptr[1].height_in_blocks * DCTSIZE; st = w + 8; size = ((int64_t)(h + 2) * st + 8) * sizeof(JSAMPLE); image2 = (JSAMPLE*)(size == (int64_t)(size_t)size ? 
malloc(size) : NULL); if (!image2) break; image2 += 7; w1 = (comp_width * DCTSIZE + ws - 1) / ws; h1 = (comp_height * DCTSIZE + hs - 1) / hs; // faster case for 4:2:0 if (1 && !((ws - 2) | (hs - 2))) { #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (y = 0; y < h1; y++) { int x; for (x = 0; x < w1; x++) { JSAMPLE *p = image + (y * 2 + 1) * stride + x * 2 + 1; int a = p[0] + p[1] + p[stride] + p[stride + 1]; image2[(y + 1) * st + x + 1] = (a + 2) >> 2; } } } else { #ifdef _OPENMP #pragma omp parallel for schedule(dynamic) #endif for (y = 0; y < h1; y++) { int x, h2 = comp_height * DCTSIZE - y * hs; h2 = h2 < hs ? h2 : hs; for (x = 0; x < w1; x++) { JSAMPLE *p = image + (y * hs + 1) * stride + x * ws + 1; int xx, yy, sum = 0, w2 = comp_width * DCTSIZE - x * ws, div; w2 = w2 < ws ? w2 : ws; div = w2 * h2; for (yy = 0; yy < h2; yy++) for (xx = 0; xx < w2; xx++) sum += p[yy * stride + xx]; image2[(y + 1) * st + x + 1] = (sum + div / 2) / div; } } } for (y = 1; y < h1 + 1; y++) { int x; JSAMPLE a = image2[y * st + w1]; image2[y * st] = image2[y * st + 1]; for (x = w1 + 1; x < w + 2; x++) image2[y * st + x] = a; } memcpy(image2, image2 + st, st * sizeof(JSAMPLE)); for (y = h1 + 1; y < h + 2; y++) memcpy(image2 + y * st, image2 + h1 * st, st * sizeof(JSAMPLE)); } while (0); #undef IMAGEPTR if (image != image1 && image != image2) free(image - 7); } #ifdef WITH_LOG if (!stop && opts->flags & JPEGQS_INFO_TIME) { time = get_time_usec() - time; logfmt("quantsmooth: %.3fms\n", time * 0.001); } #endif #ifdef _OPENMP if (old_threads > 0) omp_set_num_threads(old_threads); #endif if (tables) free(tables); if (image2 != image1 && image2) free(image2 - 7); if (image1) free(image1 - 7); if (stop) image1 = NULL; if (image1) { srcinfo->max_h_samp_factor = 1; srcinfo->max_v_samp_factor = 1; compptr = srcinfo->comp_info; compptr[0].h_samp_factor = 1; compptr[0].v_samp_factor = 1; comp_width = compptr[0].width_in_blocks; comp_height = compptr[0].height_in_blocks; #define M1(i) coef_arrays[i] = coef_up[i - 1]; \ compptr[i].width_in_blocks = comp_width; \ compptr[i].height_in_blocks = comp_height; M1(1) M1(2) #undef M1 } for (ci = 0; ci < NUM_QUANT_TBLS; ci++) { qtbl = srcinfo->quant_tbl_ptrs[ci]; if (qtbl) for (i = 0; i < DCTSIZE2; i++) qtbl->quantval[i] = 1; } for (ci = 0; ci < srcinfo->num_components; ci++) { qtbl = srcinfo->comp_info[ci].quant_table; if (qtbl) for (i = 0; i < DCTSIZE2; i++) qtbl->quantval[i] = 1; } #ifndef TRANSCODE_ONLY if (!(opts->flags & JPEGQS_TRANSCODE)) { // things needed for jpeg_read_scanlines() to work correctly if (image1) { #ifdef LIBJPEG_TURBO_VERSION srcinfo->master->last_MCU_col[1] = srcinfo->master->last_MCU_col[0]; srcinfo->master->last_MCU_col[2] = srcinfo->master->last_MCU_col[0]; #endif jinit_color_deconverter(srcinfo); jinit_upsampler(srcinfo); jinit_d_main_controller(srcinfo, FALSE); srcinfo->input_iMCU_row = (srcinfo->output_height + DCTSIZE - 1) / DCTSIZE; } jinit_inverse_dct(srcinfo); } #endif return stop; } #if !defined(TRANSCODE_ONLY) && !defined(NO_HELPERS) JPEGQS_ATTR boolean jpegqs_start_decompress(j_decompress_ptr cinfo, jpegqs_control_t *opts) { boolean ret; int use_jpeqqs = opts->niter > 0 || opts->flags & JPEGQS_UPSAMPLE_UV; if (use_jpeqqs) cinfo->buffered_image = TRUE; ret = jpeg_start_decompress(cinfo); if (use_jpeqqs) { while (!jpeg_input_complete(cinfo)) { jpeg_start_output(cinfo, cinfo->input_scan_number); jpeg_finish_output(cinfo); } do_quantsmooth(cinfo, jpeg_read_coefficients(cinfo), opts); jpeg_start_output(cinfo, 
cinfo->input_scan_number);
	}
	return ret;
}

JPEGQS_ATTR boolean jpegqs_finish_decompress(j_decompress_ptr cinfo) {
	if ((cinfo->global_state == DSTATE_SCANNING ||
			cinfo->global_state == DSTATE_RAW_OK) && cinfo->buffered_image) {
		jpeg_finish_output(cinfo);
	}
	return jpeg_finish_decompress(cinfo);
}
#endif
jpeg-quantsmooth-1.20210408/release.sh000066400000000000000000000014301403361770700174020ustar00rootroot00000000000000
#!/bin/sh
jpeg=${1:-"jpeg-6b"}
bits=${2:-""}
lib="-ljpeg -static"
[ -d $jpeg ] && lib="-DWITH_JPEGSRC -I$jpeg $jpeg/libjpeg.a -static"
# test -d winlib$bits && lib="$lib -Lwinlib$bits"
omp="libgomp.a"
test "$omp" && test -d winlib$bits && omp="winlib$bits/$omp"
test -f ldscript$bits.txt && link="-Wl,-T,ldscript$bits.txt" || link=
# make JPEGLIB="$lib" SIMD=avx2 MFLAGS="-municode" APPNAME="jpegqs${bits}_avx2" clean app
# make JPEGLIB="$lib" SIMD=sse2 MFLAGS="-municode" APPNAME="jpegqs${bits}_sse2" clean app
# make JPEGLIB="$lib" SIMD=none MFLAGS="-O3 -municode" APPNAME="jpegqs${bits}_none" clean app
rm -f "winlib$bits/libgomp.a"
make LIBMINIOMP="$omp" JPEGLIB="$lib" SIMD=select MFLAGS="-municode -fno-asynchronous-unwind-tables" APPNAME="jpegqs${bits}" LFLAGS="$link" clean all
jpeg-quantsmooth-1.20210408/wasm/000077500000000000000000000000001403361770700163775ustar00rootroot00000000000000
jpeg-quantsmooth-1.20210408/wasm/compile.sh000066400000000000000000000015701403361770700203660ustar00rootroot00000000000000
#!/bin/bash
source ../emsdk/emsdk_env.sh --build=Release
debug=0
async=1
simd=0
# --pre-js shell.js
emflags="-O3 -g0 --closure 1"
[ $debug -ne 0 ] && emflags="-O2 -g1 -s ASSERTIONS=1"
jpeg="jpeg-6b"
CFLAGS_LIB="$emflags -Wno-shift-negative-value"
CFLAGS_APP="-ffast-math -DWASM -DNO_SIMD $emflags -DWITH_JPEGSRC -I$jpeg -I."
LFLAGS="--shell-file shell.html -s EXPORTED_FUNCTIONS=\"['_malloc', '_free']\" -s ALLOW_MEMORY_GROWTH=1"
[ $async -ne 0 ] && {
	CFLAGS_APP="$CFLAGS_APP -DWASM_ASYNC -s ASYNCIFY=1 -s ASYNCIFY_IGNORE_INDIRECT"
}
[ $simd -ge 1 ] && CFLAGS_APP="$CFLAGS_APP -msimd128"
[ $simd -ge 2 ] && CFLAGS_APP="$CFLAGS_APP -msse2"
make JPEGSRC="$jpeg" CC="emcc" CFLAGS_LIB="$CFLAGS_LIB" CFLAGS_APP="$CFLAGS_APP" LFLAGS="$LFLAGS" APPNAME="quantsmooth.html" clean app
test -f quantsmooth.html && mv quantsmooth.html index.html
echo "press enter..."; read dummy < /dev/tty
jpeg-quantsmooth-1.20210408/wasm/shell.html000066400000000000000000000155361403361770700204060ustar00rootroot00000000000000
JPEG Quant Smooth (WebAssembly)
Options: 
Filename: 

You need to enable JavaScript to run this app.

{{{ SCRIPT }}}
jpeg-quantsmooth-1.20210408/windows/000077500000000000000000000000001403361770700171225ustar00rootroot00000000000000
jpeg-quantsmooth-1.20210408/windows/Makefile000066400000000000000000000006161403361770700205650ustar00rootroot00000000000000
APPNAME ?= jpegqs_gui
SRCNAME ?= jpegqs_gui.c

CFLAGS := -Wall -Wextra -pedantic -O2 -mwindows
CFLAGS += -DWITH_DROP
CFLAGS += -DSHORTCUT_MENU
LIBS := -lcomdlg32

.PHONY: clean all
all: $(APPNAME)

$(APPNAME): dialog.h

dialog.o: dialog.rc dialog.h
	windres $< $@

clean:
	rm -f $(APPNAME) dialog.o

$(APPNAME): $(SRCNAME) dialog.o
	$(CC) $(CFLAGS) -s -o $@ $< dialog.o $(LIBS) -Wl,--gc-sections
jpeg-quantsmooth-1.20210408/windows/build.sh000066400000000000000000000002601403361770700205530ustar00rootroot00000000000000
#!/bin/sh
bits=${1:-""}
test -f ../ldscript$bits.txt && link="-Wl,-T,../ldscript$bits.txt" || link=
make APPNAME="jpegqs${bits}_gui" LFLAGS="$link" LIBNAME="$name" clean all
jpeg-quantsmooth-1.20210408/windows/compile.bat000066400000000000000000000003471403361770700212460ustar00rootroot00000000000000
@echo off
set OLDPATH=%PATH%
set PATH=C:\msys64\usr\bin;C:\msys64\mingw32\bin;%OLDPATH%
C:\msys64\usr\bin\sh build.sh
set PATH=C:\msys64\usr\bin;C:\msys64\mingw64\bin;%OLDPATH%
C:\msys64\usr\bin\sh build.sh 64
pause
jpeg-quantsmooth-1.20210408/windows/dialog.h000066400000000000000000000003231403361770700205300ustar00rootroot00000000000000
#define IDI_JPEGQS 10
#define IDD_DIALOG 100
#define IDC_LOAD 101
#define IDC_STATUS 102
#define IDC_SAVE 103
#define IDC_OPTIONS 104
#define IDC_FILENAME 105
#define IDC_CONSOLE 106
jpeg-quantsmooth-1.20210408/windows/dialog.rc000066400000000000000000000021171403361770700207100ustar00rootroot00000000000000
#include "winuser.h"
#include "dialog.h"

IDI_JPEGQS ICON "jpegqs.ico"

#define DlgW 256
#define DlgH 196
#define BR 6
#define B2 2
#define BW 48
#define BH 14
#define LW 30
#define LH 14

IDD_DIALOG DIALOGEX 0, 0, DlgW, DlgH
STYLE DS_SETFONT | DS_SHELLFONT | DS_MODALFRAME | DS_CENTER | DS_3DLOOK | WS_POPUP | WS_VISIBLE | WS_CAPTION | WS_SYSMENU
CAPTION "JPEG Quant Smooth"
FONT 8, "MS Shell Dlg"
{
	PUSHBUTTON "Load", IDC_LOAD, BR, BR, BW, BH, WS_DISABLED | WS_TABSTOP
	LTEXT "", IDC_STATUS, BR*2+BW, BR+B2, DlgW-BW*2-BR*4, BH, ES_CENTER
	PUSHBUTTON "Save", IDC_SAVE, DlgW-BW-BR, BR, BW, BH, WS_DISABLED | WS_TABSTOP
	LTEXT "Options:", -1, BR, BR*2+BH+B2, LW, LH
	EDITTEXT IDC_OPTIONS, BR*2+LW, BR*2+BH, DlgW-LW-BR*3, LH, WS_BORDER | WS_TABSTOP
	LTEXT "Filename:", -1, BR, BR*3+BH+LH+B2, LW, LH
	EDITTEXT IDC_FILENAME, BR*2+LW, BR*3+BH+LH, DlgW-LW-BR*3, LH, WS_BORDER | WS_TABSTOP
	EDITTEXT IDC_CONSOLE, BR, BR*4+BH+LH*2, DlgW-BR*2, DlgH-BR*5-BH-LH*2, WS_BORDER | WS_TABSTOP | ES_READONLY | ES_MULTILINE | WS_VSCROLL | ES_AUTOVSCROLL | ES_AUTOHSCROLL
}
jpeg-quantsmooth-1.20210408/windows/jpegqs.ico000066400000000000000000000353561403361770700211170ustar00rootroot00000000000000
[binary icon data omitted: not representable as text]
jpeg-quantsmooth-1.20210408/windows/jpegqs_gui.c000066400000000000000000000362031403361770700214270ustar00rootroot00000000000000
/*
 * Copyright (C) 2020 Ilya Kurdyukov
 *
 * This file is part of jpeg quantsmooth (windows GUI wrapper)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

/* The bracketed include targets were lost in this dump; the names below are
   reconstructed from the functions used in this file. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dialog.h"

#define UNICODE
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <commdlg.h>
#ifdef WITH_DROP
#include <shellapi.h>
#endif

#pragma GCC diagnostic ignored "-Wformat"

#define FNLEN_MAX 1024
static wchar_t ofnbuf[FNLEN_MAX], sfnbuf[FNLEN_MAX];
static OPENFILENAME ofn, sfn;
static const wchar_t *appname = L"JPEGQS Wrapper";
static LPCWSTR jpegqs_exe;
static HWND hwndDlg = NULL;
static HANDLE hErrRead = INVALID_HANDLE_VALUE;
static HANDLE hOutRead = INVALID_HANDLE_VALUE;
static HANDLE infoThread = INVALID_HANDLE_VALUE;
static HANDLE outThread = INVALID_HANDLE_VALUE;
static const wchar_t *msg_multdrop = L"Multiple file drop unsupported.";

#define OPTLEN 128
#define OUTINCR (1 << 18)
#define LOGINCR 16
static char *outmem = NULL;
static size_t outcur, outmax;
static char *logmem = NULL;
static size_t logcur, logmax;
static char options[OPTLEN];
static volatile int processing = 0;

static void log_grow(size_t n) {
	if (logcur + n >= logmax) {
		logmax = (logcur + n + LOGINCR) & -LOGINCR;
		logmem = realloc(logmem, logmax);
		if (!logmem) return;
	}
}

static void log_update(size_t n) {
	if (hwndDlg) {
		// SetDlgItemTextA(hwndDlg, IDC_CONSOLE, logmem);
		// update and scroll to bottom
		(void)n;
		HWND edit = GetDlgItem(hwndDlg, IDC_CONSOLE);
		SendMessageA(edit, EM_SETSEL, 0, -1);
		SendMessageA(edit, EM_REPLACESEL, 0, (LPARAM)logmem);
	}
}

static void log_append(const char *str, int n) {
	if (!logmem) return;
	if (n < 0) n = strlen(str);
	log_grow(n);
	memcpy(logmem + logcur, str, n);
	logcur += n;
	logmem[logcur] = 0;
	log_update(n);
}

// reads the child process's stderr from a pipe and appends it to the log window
static DWORD WINAPI infoThreadProc(LPVOID lpParam) {
	int ret;
	(void)lpParam;
	for (;;) {
		DWORD rwCnt = 0;
		ret = PeekNamedPipe(hErrRead, NULL, 0, NULL, &rwCnt, NULL);
		if (!ret) break;
		if (!rwCnt) { Sleep(100); continue; }
		log_grow(rwCnt);
		ret =
ReadFile(hErrRead, logmem + logcur, rwCnt, &rwCnt, NULL); if (!ret || !rwCnt) break; logcur += rwCnt; logmem[logcur] = 0; log_update(rwCnt); } CloseHandle(hErrRead); WaitForSingleObject(outThread, INFINITE); { char strbuf[80]; snprintf(strbuf, sizeof(strbuf), "Output size: %i\r\n", (int)outcur); log_append(strbuf, -1); } return 0; } static DWORD WINAPI outThreadProc(LPVOID lpParam) { int ret; (void)lpParam; outmem = malloc(OUTINCR); outmax = OUTINCR; outcur = 0; for (;;) { DWORD rwCnt; ret = ReadFile(hOutRead, outmem + outcur, outmax - outcur, &rwCnt, NULL); if (!ret || !rwCnt) break; outcur += rwCnt; if (outcur == outmax) { outmem = realloc(outmem, outmax + OUTINCR); outmax += OUTINCR; } } processing = 0; CloseHandle(hOutRead); SetDlgItemText(hwndDlg, IDC_STATUS, L""); if (hwndDlg) { EnableWindow(GetDlgItem(hwndDlg, IDC_LOAD), TRUE); if (outcur) EnableWindow(GetDlgItem(hwndDlg, IDC_SAVE), TRUE); } return 0; } static void closeHandles() { if (outmem) { free(outmem); outmem = NULL; } if (infoThread != INVALID_HANDLE_VALUE) { WaitForSingleObject(infoThread, INFINITE); CloseHandle(infoThread); } if (outThread != INVALID_HANDLE_VALUE) { WaitForSingleObject(outThread, INFINITE); CloseHandle(outThread); } } static int findfn(const wchar_t *path, int n) { if (n < 0) n = wcslen(path); while (n > 0) { wchar_t a = path[n - 1]; if (a == '\\' || a == '/') break; n--; } return n; } static void cbSave() { int ret; if (!processing && outmem && outcur) { if (!sfnbuf[0]) wcscpy(sfnbuf, ofnbuf); if (hwndDlg) { int n = findfn(sfnbuf, -1); GetDlgItemText(hwndDlg, IDC_FILENAME, sfnbuf + n, FNLEN_MAX - n); } ret = GetSaveFileName(&sfn); if (ret) { DWORD rwCnt; HANDLE outHandle = CreateFile(sfnbuf, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); if (outHandle != INVALID_HANDLE_VALUE) { ret = WriteFile(outHandle, outmem, outcur, &rwCnt, NULL); CloseHandle(outHandle); } } } } static void cbLoad(int use_ofn) { int ret = 1; if (processing) return; processing = 1; if (use_ofn) ret = GetOpenFileName(&ofn); if (ret) { SECURITY_ATTRIBUTES saAttr; STARTUPINFO si; PROCESS_INFORMATION pi; HANDLE hErrWrite = INVALID_HANDLE_VALUE, hOutWrite = hErrRead; DWORD threadId; closeHandles(); if (hwndDlg) { int n = findfn(ofnbuf, -1); SetDlgItemText(hwndDlg, IDC_FILENAME, ofnbuf + n); GetDlgItemTextA(hwndDlg, IDC_OPTIONS, options, OPTLEN); EnableWindow(GetDlgItem(hwndDlg, IDC_FILENAME), TRUE); EnableWindow(GetDlgItem(hwndDlg, IDC_LOAD), FALSE); EnableWindow(GetDlgItem(hwndDlg, IDC_SAVE), FALSE); SetDlgItemText(hwndDlg, IDC_STATUS, L"Processing..."); } saAttr.nLength = sizeof(SECURITY_ATTRIBUTES); saAttr.bInheritHandle = TRUE; saAttr.lpSecurityDescriptor = NULL; if (logmem) { CreatePipe(&hErrRead, &hErrWrite, &saAttr, 0); SetHandleInformation(hErrRead, HANDLE_FLAG_INHERIT, 0); } CreatePipe(&hOutRead, &hOutWrite, &saAttr, 0); SetHandleInformation(hOutRead, HANDLE_FLAG_INHERIT, 0); if (logmem) { size_t inp_size = 0; char strbuf[80]; snprintf(strbuf, sizeof(strbuf), "Loading \"%S\"\r\n", ofnbuf + findfn(ofnbuf, -1)); log_append(strbuf, -1); HANDLE hFile = CreateFile(ofnbuf, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, 0, NULL); if (hFile != INVALID_HANDLE_VALUE) { inp_size = GetFileSize(hFile, NULL); CloseHandle(hFile); } snprintf(strbuf, sizeof(strbuf), "Input size: %i\r\n", (int)inp_size); log_append(strbuf, -1); log_append("Processing...\r\n", -1); } memset(&si, 0, sizeof(si)); memset(&pi, 0, sizeof(pi)); si.cb = sizeof(si); si.dwFlags = STARTF_USESTDHANDLES; si.hStdInput = 
INVALID_HANDLE_VALUE; si.hStdOutput = hOutWrite; si.hStdError = hErrWrite; { wchar_t jpegqs_cmd[FNLEN_MAX + OPTLEN + 80]; snwprintf(jpegqs_cmd, sizeof(jpegqs_cmd) / sizeof(jpegqs_cmd[0]), L"\"%s\" --hwnd %i %S -- \"%s\" -", jpegqs_exe, (int)(intptr_t)hwndDlg, options, ofnbuf); ret = CreateProcess(NULL, jpegqs_cmd, NULL, NULL, TRUE, CREATE_NO_WINDOW, NULL, NULL, &si, &pi); } CloseHandle(pi.hProcess); CloseHandle(pi.hThread); CloseHandle(hErrWrite); CloseHandle(hOutWrite); if (!ret) { wchar_t strbuf[80]; snwprintf(strbuf, sizeof(strbuf) / sizeof(strbuf[0]), L"CreateProcess(\"%s\") failed with code %i\n", jpegqs_exe, GetLastError()); MessageBox(hwndDlg, strbuf, appname, MB_OK); CloseHandle(hErrRead); CloseHandle(hOutRead); SetDlgItemText(hwndDlg, IDC_STATUS, L""); EnableWindow(GetDlgItem(hwndDlg, IDC_LOAD), TRUE); } else { infoThread = CreateThread(NULL, 0, infoThreadProc, NULL, 0, &threadId); outThread = CreateThread(NULL, 0, outThreadProc, NULL, 0, &threadId); return; } } processing = 0; } INT_PTR WINAPI DialogProc(HWND hwnd, UINT uMsg, WPARAM wParam, LPARAM lParam) { (void)lParam; switch (uMsg) { case WM_INITDIALOG: { HINSTANCE hInst = GetModuleHandle(NULL); HICON hIcon = LoadIcon(hInst, MAKEINTRESOURCE(IDI_JPEGQS)); if (hIcon) SendMessage(hwnd, WM_SETICON, ICON_BIG, (LPARAM)hIcon); } hwndDlg = hwnd; ofn.hwndOwner = sfn.hwndOwner = hwnd; SetDlgItemTextA(hwnd, IDC_OPTIONS, options); SetDlgItemText(hwnd, IDC_FILENAME, L""); EnableWindow(GetDlgItem(hwnd, IDC_LOAD), TRUE); EnableWindow(GetDlgItem(hwnd, IDC_SAVE), FALSE); EnableWindow(GetDlgItem(hwnd, IDC_FILENAME), FALSE); #ifdef WITH_DROP DragAcceptFiles(hwnd, TRUE); #endif #ifdef SHORTCUT_MENU log_append("Press Ctrl+S for context menu shortcut.\r\n", -1); #endif return TRUE; case WM_COMMAND: if (HIWORD(wParam) == BN_CLICKED) { switch (LOWORD(wParam)) { case IDC_LOAD: cbLoad(1); break; case IDC_SAVE: cbSave(); break; } } break; case WM_USER: if (processing) { wchar_t buf[40]; snwprintf(buf, sizeof(buf), L"Processing: %i%%", (int)wParam); SetDlgItemText(hwnd, IDC_STATUS, buf); } return TRUE; #ifdef WITH_DROP case WM_DROPFILES: { HDROP hDrop = (HDROP)wParam; int n = DragQueryFile(hDrop, -1, NULL, 0); if (n != 1) { MessageBox(hwnd, msg_multdrop, appname, MB_OK); } else { if (DragQueryFile(hDrop, 0, ofnbuf, FNLEN_MAX)) cbLoad(0); } DragFinish(hDrop); break; } #endif case WM_DESTROY: PostQuitMessage(0); break; case WM_CLOSE: GetDlgItemTextA(hwnd, IDC_OPTIONS, options, OPTLEN); #ifdef SHORTCUT_MENU DestroyWindow(hwnd); #else EndDialog(hwnd, 0); #endif break; } return FALSE; } #ifdef SHORTCUT_MENU static void shell_menu(int action) { const char *regkey = "shell\\jpegqs", *subkey = "shell\\jpegqs\\command"; const wchar_t *menuname = L"JPEG &Quant Smooth"; const wchar_t *addmsg = L"Add Quant Smooth to shortcut menu for JPEG files?"; const wchar_t *remmsg = L"Remove Quant Smooth from shortcut menu?"; const char *types[] = { ".jpg", ".jpeg", 0 }; char links[2][80]; HKEY hKey; LSTATUS status; int i, ret; // check for redirects for (i = 0; types[i]; i++) { DWORD size = sizeof(links[0]) - 1; status = RegOpenKeyExA(HKEY_CLASSES_ROOT, types[i], 0, KEY_READ, &hKey); links[i][0] = 0; if (status == ERROR_SUCCESS) { status = RegQueryValueExA(hKey, NULL, 0, NULL, (LPBYTE)links[i], &size); if (status == ERROR_SUCCESS) links[i][size] = 0; else strcpy(links[i], types[i]); RegCloseKey(hKey); } } if (!strcmp(links[0], links[1])) links[1][0] = 0; if (action < 0) { char buf[80]; if (!links[0][0]) return; snprintf(buf, sizeof(buf), "%s\\%s", links[0], regkey); action = 
0; status = RegOpenKeyExA(HKEY_CLASSES_ROOT, buf, 0, KEY_READ, &hKey); if (status == ERROR_SUCCESS) { action = 1; RegCloseKey(hKey); } } ret = MessageBox(hwndDlg, action ? remmsg : addmsg, appname, MB_YESNO | MB_ICONQUESTION); if (ret != IDYES) return; for (i = 0; types[i]; i++) { HKEY hKey1; if (!links[i][0]) continue; status = RegOpenKeyExA(HKEY_CLASSES_ROOT, links[i], 0, KEY_WRITE | KEY_QUERY_VALUE, &hKey1); if (status != ERROR_SUCCESS) continue; if (!action) { wchar_t exe[FNLEN_MAX]; int len, n = 16; len = GetModuleFileNameW(NULL, exe + 1, FNLEN_MAX - n - 1); exe[0] = '"'; len++; #define NEWKEY(name) \ status = RegCreateKeyExA(hKey1, name, 0, NULL, \ REG_OPTION_NON_VOLATILE, KEY_WRITE | KEY_QUERY_VALUE, NULL, &hKey, NULL); \ if (status == ERROR_SUCCESS) #define REGSETW(name, str) \ RegSetValueExW(hKey, name, 0, REG_SZ, (LPBYTE)str, (wcslen(str) + 1) * sizeof(wchar_t)); NEWKEY(regkey) { REGSETW(NULL, menuname) wcscpy(exe + len, L"\",0"); REGSETW(L"Icon", exe) RegCloseKey(hKey); NEWKEY(subkey) { wcscpy(exe + len, L"\" \"%1\""); REGSETW(NULL, exe) RegCloseKey(hKey); } } #undef NEWKEY #undef REGSETW } else { RegDeleteKeyA(hKey1, subkey); RegDeleteKeyA(hKey1, regkey); } RegCloseKey(hKey1); } } #endif static const TCHAR *nextarg(const TCHAR *cmd, const TCHAR **arg, int *len) { TCHAR a = 0, e = ' '; const TCHAR *s; if (cmd) do a = *cmd++; while (a == ' '); if (a == '"') { e = a; a = *cmd++; } s = cmd; while (a && a != e) a = *cmd++; *arg = s - 1; *len = cmd - s; return a ? cmd : NULL; } int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow) { (void)hInstance; (void)hPrevInstance; (void)lpCmdLine; (void)nCmdShow; const wchar_t *fnfilt = L"JPEG image (*.jpg, *.jpeg)\0*.jpg;*jpeg\0All files\0*.*\0"; const char *regkey = "Software\\JPEG Quant Smooth"; { int n = GetModuleFileNameW(NULL, ofnbuf, FNLEN_MAX); n = findfn(ofnbuf, n); if (n) { ofnbuf[n] = 0; SetCurrentDirectoryW(ofnbuf); } } memset(&ofn, 0, sizeof(ofn)); ofn.lStructSize = sizeof(ofn); ofn.lpstrFilter = fnfilt; ofn.nMaxFile = FNLEN_MAX; memcpy(&sfn, &ofn, sizeof(ofn)); ofn.lpstrFile = ofnbuf; sfn.lpstrFile = sfnbuf; ofnbuf[0] = sfnbuf[0] = 0; ofn.lpstrTitle = L"Open JPEG image"; ofn.Flags = OFN_HIDEREADONLY | OFN_FILEMUSTEXIST | OFN_PATHMUSTEXIST | OFN_EXPLORER | OFN_NOCHANGEDIR; sfn.lpstrTitle = L"Save JPEG image"; sfn.Flags = OFN_HIDEREADONLY | OFN_OVERWRITEPROMPT | OFN_PATHMUSTEXIST | OFN_EXPLORER | OFN_NOCHANGEDIR; #ifdef __x86_64__ jpegqs_exe = L"jpegqs64.exe"; #else jpegqs_exe = L"jpegqs.exe"; #endif { HKEY hKey; DWORD size = OPTLEN - 1; LSTATUS status = RegOpenKeyExA(HKEY_CURRENT_USER, regkey, 0, KEY_READ, &hKey); if (status == ERROR_SUCCESS) { status = RegQueryValueExA(hKey, "options", 0, NULL, (LPBYTE)options, &size); if (status == ERROR_SUCCESS) options[size] = 0; RegCloseKey(hKey); } if (status != ERROR_SUCCESS) strcpy(options, "--optimize --info 8 --quality 3"); } { int n1, n2; const TCHAR *cmd = GetCommandLine(), *arg1, *arg2; cmd = nextarg(cmd, &arg1, &n1); cmd = nextarg(cmd, &arg1, &n1); cmd = nextarg(cmd, &arg2, &n2); if (n2) { MessageBox(NULL, msg_multdrop, appname, MB_OK); return 0; } else if (n1) { memcpy(ofnbuf, arg1, n1 * sizeof(TCHAR)); ofnbuf[n1] = 0; } } if (ofnbuf[0]) { cbLoad(0); if (outThread != INVALID_HANDLE_VALUE) { WaitForSingleObject(outThread, INFINITE); cbSave(); } } else { logmem = malloc(LOGINCR); logmax = LOGINCR; logcur = 0; #ifdef SHORTCUT_MENU { HWND hwnd = CreateDialogParam(0, MAKEINTRESOURCE(IDD_DIALOG), NULL, (DLGPROC)DialogProc, (LPARAM)NULL); if (hwnd) 
{ MSG msg; while (GetMessage(&msg, NULL, 0, 0)) { if (msg.message == WM_KEYDOWN) if (msg.wParam == 'S' && !(msg.lParam & (1 << 30))) if (GetAsyncKeyState(VK_CONTROL) < 0) { shell_menu(-1); continue; } if (IsDialogMessage(hwnd, &msg)) continue; TranslateMessage(&msg); DispatchMessage(&msg); } } } #else DialogBoxParam(0, MAKEINTRESOURCE(IDD_DIALOG), NULL, (DLGPROC)DialogProc, (LPARAM)NULL); #endif { HKEY hKey; LSTATUS status = RegCreateKeyExA(HKEY_CURRENT_USER, regkey, 0, NULL, REG_OPTION_NON_VOLATILE, KEY_WRITE | KEY_QUERY_VALUE, NULL, &hKey, NULL); if (status == ERROR_SUCCESS) { RegSetValueExA(hKey, "options", 0, REG_SZ, (LPBYTE)options, strlen(options) + 1); RegCloseKey(hKey); } } } closeHandles(); if (logmem) free(logmem); return 0; }
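For reference, the sketch below is not part of the archive: it shows how an application might call the helper API defined in the library portion of this dump, jpegqs_start_decompress() and jpegqs_finish_decompress(), as drop-in replacements for jpeg_start_decompress() and jpeg_finish_decompress(). Only the two helper functions, the jpegqs_control_t fields niter/flags/threads, and the JPEGQS_UPSAMPLE_UV and JPEGQS_JOINT_YUV flags are taken from the code above; the header name "libjpegqs.h", the zero-initialization of the remaining fields, and the particular option values are assumptions for illustration.

/* Minimal usage sketch, assuming the public header is named "libjpegqs.h"
 * and the program links against libjpeg plus the jpegqs objects. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <jpeglib.h>
#include "libjpegqs.h" /* assumed header name; not shown in this dump */

int main(int argc, char **argv) {
	struct jpeg_decompress_struct ci;
	struct jpeg_error_mgr jerr;
	jpegqs_control_t opts;
	JSAMPROW row;
	FILE *fp;

	if (argc < 2) return 1;
	if (!(fp = fopen(argv[1], "rb"))) return 1;

	ci.err = jpeg_std_error(&jerr);
	jpeg_create_decompress(&ci);
	jpeg_stdio_src(&ci, fp);
	jpeg_read_header(&ci, TRUE);

	/* smoothing options: a few iterations, restore chroma resolution */
	memset(&opts, 0, sizeof(opts));
	opts.niter = 3;
	opts.flags = JPEGQS_JOINT_YUV | JPEGQS_UPSAMPLE_UV;
	opts.threads = 0; /* 0 = use all cores, per the OpenMP setup above */

	/* smooths the DCT coefficients, then behaves like jpeg_start_decompress() */
	jpegqs_start_decompress(&ci, &opts);

	row = (JSAMPROW)malloc((size_t)ci.output_width * ci.output_components);
	while (ci.output_scanline < ci.output_height) {
		jpeg_read_scanlines(&ci, &row, 1);
		/* ... consume one output row here ... */
	}
	free(row);

	jpegqs_finish_decompress(&ci);
	jpeg_destroy_decompress(&ci);
	fclose(fp);
	return 0;
}

The GUI wrapper above takes a different route: instead of linking the library, it spawns the command-line jpegqs executable with CreateProcess and captures its stdout/stderr through pipes, which keeps the wrapper itself independent of libjpeg.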